diff --git a/Android.mk b/Android.mk
index 69e0d33f1aa..ed160fb3d0e 100644
--- a/Android.mk
+++ b/Android.mk
@@ -45,8 +45,6 @@ endif
 MESA_COMMON_MK := $(MESA_TOP)/Android.common.mk
 MESA_PYTHON2 := python
 
-DRM_GRALLOC_TOP := hardware/drm_gralloc
-
 classic_drivers := i915 i965
 gallium_drivers := swrast freedreno i915g ilo nouveau r300g r600g radeonsi vmwgfx vc4
 
@@ -91,8 +89,7 @@ SUBDIRS := \
 	src/glsl \
 	src/mesa \
 	src/util \
-	src/egl/main \
-	src/egl/drivers/dri2 \
+	src/egl \
 	src/mesa/drivers/dri
 
 ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
diff --git a/VERSION b/VERSION
index 1edd8fc00e5..2b1181ddc3f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-10.7.0-devel
+11.0.0-devel
diff --git a/bin/bugzilla_mesa.sh b/bin/bugzilla_mesa.sh
index 491ca0e7c0b..0cff4261f75 100755
--- a/bin/bugzilla_mesa.sh
+++ b/bin/bugzilla_mesa.sh
@@ -15,17 +15,14 @@
 # $ DRYRUN=yes bin/bugzilla_mesa.sh mesa-9.0.2..mesa-9.0.3 | wc -l
 
 
-# regex pattern: trim before url
-trim_before='s/.*\(http\)/\1/'
+# regex pattern: keep only the bug number (trim everything before and after it)
+trim_before='s/.*show_bug.cgi?id=\([0-9]*\).*/\1/'
 
-# regex pattern: trim after url
-trim_after='s/\(show_bug.cgi?id=[0-9]*\).*/\1/'
-
-# regex pattern: always use https
-use_https='s/http:/https:/'
+# regex pattern: reconstruct the url
+use_after='s,^,https://bugs.freedesktop.org/show_bug.cgi?id=,'
 
 # extract fdo urls from commit log
-urls=$(git log $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before -e $trim_after -e $use_https | sort | uniq)
+urls=$(git log $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before | sort -n -u | sed -e $use_after)
 
 # if DRYRUN is set to "yes", simply print the URLs and don't fetch the
 # details from fdo bugzilla.
diff --git a/configure.ac b/configure.ac
index e78a4ba6325..74e13b3fcb7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -44,7 +44,7 @@ AC_INIT([Mesa], [MESA_VERSION],
 AC_CONFIG_AUX_DIR([bin])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CANONICAL_SYSTEM
-AM_INIT_AUTOMAKE([foreign tar-ustar dist-xz])
+AM_INIT_AUTOMAKE([foreign tar-ustar dist-xz subdir-objects])
 
 dnl We only support native Windows builds (MinGW/MSVC) through SCons.
 case "$host_os" in
@@ -64,13 +64,16 @@ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
 dnl Set internal versions
 OSMESA_VERSION=8
 AC_SUBST([OSMESA_VERSION])
+OPENCL_VERSION=1
+AC_SUBST([OPENCL_VERSION])
 
 dnl Versions for external dependencies
-LIBDRM_REQUIRED=2.4.38
+LIBDRM_REQUIRED=2.4.60
 LIBDRM_RADEON_REQUIRED=2.4.56
-LIBDRM_INTEL_REQUIRED=2.4.60
+LIBDRM_AMDGPU_REQUIRED=2.4.63
+LIBDRM_INTEL_REQUIRED=2.4.61
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
-LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41"
+LIBDRM_NOUVEAU_REQUIRED=2.4.62
 LIBDRM_FREEDRENO_REQUIRED=2.4.57
 DRI2PROTO_REQUIRED=2.6
 DRI3PROTO_REQUIRED=1.0
@@ -79,7 +82,7 @@ LIBUDEV_REQUIRED=151
 GLPROTO_REQUIRED=1.4.14
 LIBOMXIL_BELLAGIO_REQUIRED=0.0
 LIBVA_REQUIRED=0.35.0
-VDPAU_REQUIRED=0.4.1
+VDPAU_REQUIRED=1.1
 WAYLAND_REQUIRED=1.2.0
 XCB_REQUIRED=1.9.3
 XCBDRI2_REQUIRED=1.8
@@ -205,11 +208,14 @@ AX_GCC_BUILTIN([__builtin_popcount])
 AX_GCC_BUILTIN([__builtin_popcountll])
 AX_GCC_BUILTIN([__builtin_unreachable])
 
+AX_GCC_FUNC_ATTRIBUTE([const])
 AX_GCC_FUNC_ATTRIBUTE([flatten])
 AX_GCC_FUNC_ATTRIBUTE([format])
 AX_GCC_FUNC_ATTRIBUTE([malloc])
 AX_GCC_FUNC_ATTRIBUTE([packed])
+AX_GCC_FUNC_ATTRIBUTE([pure])
 AX_GCC_FUNC_ATTRIBUTE([unused])
+AX_GCC_FUNC_ATTRIBUTE([warn_unused_result])
 
 AM_CONDITIONAL([GEN_ASM_OFFSETS], test "x$GEN_ASM_OFFSETS" = xyes)
 
@@ -230,7 +236,7 @@ _SAVE_LDFLAGS="$LDFLAGS"
 _SAVE_CPPFLAGS="$CPPFLAGS"
 
 dnl Compiler macros
-DEFINES=""
+DEFINES="-D__STDC_LIMIT_MACROS"
 AC_SUBST([DEFINES])
 case "$host_os" in
 linux*|*-gnu*|gnu*)
@@ -281,6 +287,9 @@ if test "x$GCC" = xyes; then
     # Work around aliasing bugs - developers should comment this out
     CFLAGS="$CFLAGS -fno-strict-aliasing"
 
+    # We don't want floating-point math functions to set errno or trap
+    CFLAGS="$CFLAGS -fno-math-errno -fno-trapping-math"
+
     # gcc's builtin memcmp is slower than glibc's
     # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
     CFLAGS="$CFLAGS -fno-builtin-memcmp"
@@ -651,6 +660,7 @@ fi
 AC_CHECK_HEADER([xlocale.h], [DEFINES="$DEFINES -DHAVE_XLOCALE_H"])
 AC_CHECK_HEADER([sys/sysctl.h], [DEFINES="$DEFINES -DHAVE_SYS_SYSCTL_H"])
 AC_CHECK_FUNC([strtof], [DEFINES="$DEFINES -DHAVE_STRTOF"])
+AC_CHECK_FUNC([mkostemp], [DEFINES="$DEFINES -DHAVE_MKOSTEMP"])
 
 dnl Check to see if dlopen is in default libraries (like Solaris, which
 dnl has it in libc), or if libdl is needed to get it.
@@ -910,6 +920,13 @@ fi
 AM_CONDITIONAL(HAVE_DRI_GLX, test "x$enable_glx" = xyes -a \
                                   "x$enable_dri" = xyes)
 
+# Check for libdrm
+PKG_CHECK_MODULES([LIBDRM], [libdrm >= $LIBDRM_REQUIRED],
+                  [have_libdrm=yes], [have_libdrm=no])
+if test "x$have_libdrm" = xyes; then
+	DEFINES="$DEFINES -DHAVE_LIBDRM"
+fi
+
 # Select which platform-dependent DRI code gets built
 case "$host_os" in
 darwin*)
@@ -922,8 +939,8 @@ esac
 
 AM_CONDITIONAL(HAVE_DRICOMMON, test "x$enable_dri" = xyes )
 AM_CONDITIONAL(HAVE_DRISW, test "x$enable_dri" = xyes )
-AM_CONDITIONAL(HAVE_DRI2, test "x$enable_dri" = xyes -a "x$dri_platform" = xdrm )
-AM_CONDITIONAL(HAVE_DRI3, test "x$enable_dri3" = xyes -a "x$dri_platform" = xdrm )
+AM_CONDITIONAL(HAVE_DRI2, test "x$enable_dri" = xyes -a "x$dri_platform" = xdrm -a "x$have_libdrm" = xyes )
+AM_CONDITIONAL(HAVE_DRI3, test "x$enable_dri3" = xyes -a "x$dri_platform" = xdrm -a "x$have_libdrm" = xyes )
 AM_CONDITIONAL(HAVE_APPLEDRI, test "x$enable_dri" = xyes -a "x$dri_platform" = xapple )
 
 AC_ARG_ENABLE([shared-glapi],
@@ -952,11 +969,9 @@ dnl
 dnl Driver specific build directories
 dnl
 
-case "x$enable_glx$enable_xlib_glx" in
-xyesyes)
+if test -n "$with_gallium_drivers" -a "x$enable_glx$enable_xlib_glx" = xyesyes; then
     NEED_WINSYS_XLIB="yes"
-    ;;
-esac
+fi
 
 if test "x$enable_dri" = xyes; then
     enable_gallium_loader="$enable_shared_pipe_drivers"
@@ -1111,13 +1126,6 @@ if test "x$with_sha1" = "x"; then
 fi
 AM_CONDITIONAL([ENABLE_SHADER_CACHE], [test x$enable_shader_cache = xyes])
 
-# Check for libdrm
-PKG_CHECK_MODULES([LIBDRM], [libdrm >= $LIBDRM_REQUIRED],
-                  [have_libdrm=yes], [have_libdrm=no])
-if test "x$have_libdrm" = xyes; then
-	DEFINES="$DEFINES -DHAVE_LIBDRM"
-fi
-
 case "$host_os" in
 linux*)
     need_pci_id=yes ;;
@@ -1357,7 +1365,7 @@ if test "x$enable_dri" = xyes; then
         fi
         ;;
     darwin*)
-        DEFINES="$DEFINES -DGLX_ALIAS_UNSUPPORTED"
+        DEFINES="$DEFINES -DGLX_ALIAS_UNSUPPORTED -DBUILDING_MESA"
         if test "x$with_dri_drivers" = "xyes"; then
             with_dri_drivers="swrast"
         fi
@@ -1378,26 +1386,6 @@ if test "x$enable_dri" = xyes; then
                      [AC_MSG_ERROR([Expat library required for DRI not found])])
          EXPAT_LIBS="-lexpat"])
 
-    DRICOMMON_NEED_LIBDRM=no
-    # If we are building any DRI driver other than swrast.
-    if test -n "$with_dri_drivers"; then
-        if test "x$with_dri_drivers" != xswrast; then
-            # ... libdrm is required
-            if test "x$have_libdrm" != xyes; then
-                AC_MSG_ERROR([DRI drivers requires libdrm >= $LIBDRM_REQUIRED])
-            fi
-            DRICOMMON_NEED_LIBDRM=yes
-        fi
-    fi
-
-    # If we're building any gallium DRI driver other than swrast
-    if test -n "$with_gallium_drivers" -a "x$DRICOMMON_NEED_LIBDRM" = xno; then
-        if test "x$with_gallium_drivers" != xswrast; then
-            # ... build a libdrm aware dricommon
-            DRICOMMON_NEED_LIBDRM=yes
-        fi
-    fi
-
     # put all the necessary libs together
     DRI_LIB_DEPS="$DRI_LIB_DEPS $SELINUX_LIBS $LIBDRM_LIBS $EXPAT_LIBS -lm $PTHREAD_LIBS $DLOPEN_LIBS"
 fi
@@ -1425,7 +1413,7 @@ if test -n "$with_dri_drivers"; then
             ;;
         xnouveau)
             HAVE_NOUVEAU_DRI=yes;
-            PKG_CHECK_MODULES([NOUVEAU], [libdrm_nouveau >= $LIBDRM_NVVIEUX_REQUIRED])
+            PKG_CHECK_MODULES([NVVIEUX], [libdrm_nouveau >= $LIBDRM_NVVIEUX_REQUIRED])
             ;;
         xradeon)
             HAVE_RADEON_DRI=yes;
@@ -1765,6 +1753,9 @@ egl_platforms=`IFS=', '; echo $with_egl_platforms`
 for plat in $egl_platforms; do
 	case "$plat" in
 	wayland)
+		test "x$have_libdrm" != xyes &&
+			AC_MSG_ERROR([EGL platform wayland requires libdrm >= $LIBDRM_REQUIRED])
+
 		PKG_CHECK_MODULES([WAYLAND], [wayland-client >= $WAYLAND_REQUIRED wayland-server >= $WAYLAND_REQUIRED])
 
 		if test "x$WAYLAND_SCANNER" = x; then
@@ -1788,9 +1779,6 @@ for plat in $egl_platforms; do
 			AC_MSG_ERROR([EGL platform surfaceless requires libdrm >= $LIBDRM_REQUIRED])
 		;;
 
-	android|gdi|null)
-		;;
-
 	*)
 		AC_MSG_ERROR([EGL platform '$plat' does not exist])
 		;;
@@ -1811,9 +1799,6 @@ else
     EGL_NATIVE_PLATFORM="_EGL_INVALID_PLATFORM"
 fi
 
-if echo "$egl_platforms" | grep -q 'x11'; then
-    NEED_WINSYS_XLIB=yes
-fi
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_X11, echo "$egl_platforms" | grep -q 'x11')
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_WAYLAND, echo "$egl_platforms" | grep -q 'wayland')
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_DRM, echo "$egl_platforms" | grep -q 'drm')
@@ -2127,6 +2112,7 @@ if test -n "$with_gallium_drivers"; then
         xradeonsi)
             HAVE_GALLIUM_RADEONSI=yes
             PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
+            PKG_CHECK_MODULES([AMDGPU], [libdrm_amdgpu >= $LIBDRM_AMDGPU_REQUIRED])
             gallium_require_drm "radeonsi"
             gallium_require_drm_loader
             radeon_llvm_check "radeonsi"
@@ -2237,31 +2223,15 @@ AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers"
 #       use by XA tracker in particular, but could be used in any case
 #       where communication with xserver is not desired).
 if test "x$enable_gallium_loader" = xyes; then
-    if test "x$NEED_WINSYS_XLIB" = xyes; then
-        GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_XLIB"
-    fi
-
     if test "x$enable_dri" = xyes; then
         GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_DRI"
     fi
 
     if test "x$enable_gallium_drm_loader" = xyes; then
         GALLIUM_PIPE_LOADER_DEFINES="$GALLIUM_PIPE_LOADER_DEFINES -DHAVE_PIPE_LOADER_DRM"
-        PKG_CHECK_MODULES([GALLIUM_PIPE_LOADER_XCB], [xcb xcb-dri2],
-                          pipe_loader_have_xcb=yes, pipe_loader_have_xcb=no)
-        if test "x$pipe_loader_have_xcb" = xyes; then
-            GALLIUM_PIPE_LOADER_CLIENT_DEFINES="$GALLIUM_PIPE_LOADER_CLIENT_DEFINES -DHAVE_PIPE_LOADER_XCB"
-            GALLIUM_PIPE_LOADER_CLIENT_LIBS="$GALLIUM_PIPE_LOADER_CLIENT_LIBS $GALLIUM_PIPE_LOADER_XCB_LIBS $LIBDRM_LIBS"
-        fi
     fi
 
-    GALLIUM_PIPE_LOADER_CLIENT_DEFINES="$GALLIUM_PIPE_LOADER_CLIENT_DEFINES $GALLIUM_PIPE_LOADER_DEFINES"
-    GALLIUM_PIPE_LOADER_CLIENT_LIBS="$GALLIUM_PIPE_LOADER_CLIENT_LIBS $GALLIUM_PIPE_LOADER_LIBS"
-
     AC_SUBST([GALLIUM_PIPE_LOADER_DEFINES])
-    AC_SUBST([GALLIUM_PIPE_LOADER_LIBS])
-    AC_SUBST([GALLIUM_PIPE_LOADER_CLIENT_DEFINES])
-    AC_SUBST([GALLIUM_PIPE_LOADER_CLIENT_LIBS])
 fi
 
 AM_CONDITIONAL(HAVE_I915_DRI, test x$HAVE_I915_DRI = xyes)
@@ -2288,7 +2258,6 @@ fi
 
 AC_SUBST([ELF_LIB])
 
-AM_CONDITIONAL(DRICOMMON_NEED_LIBDRM, test "x$DRICOMMON_NEED_LIBDRM" = xyes)
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
@@ -2348,8 +2317,7 @@ CXXFLAGS="$CXXFLAGS $USER_CXXFLAGS"
 dnl Substitute the config
 AC_CONFIG_FILES([Makefile
 		src/Makefile
-		src/egl/drivers/dri2/Makefile
-		src/egl/main/Makefile
+		src/egl/Makefile
 		src/egl/main/egl.pc
 		src/egl/wayland/wayland-drm/Makefile
 		src/egl/wayland/wayland-egl/Makefile
@@ -2388,6 +2356,7 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/targets/libgl-xlib/Makefile
 		src/gallium/targets/omx/Makefile
 		src/gallium/targets/opencl/Makefile
+		src/gallium/targets/opencl/mesa.icd
 		src/gallium/targets/osmesa/Makefile
 		src/gallium/targets/osmesa/osmesa.pc
 		src/gallium/targets/pipe-loader/Makefile
@@ -2403,6 +2372,7 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/winsys/intel/drm/Makefile
 		src/gallium/winsys/nouveau/drm/Makefile
 		src/gallium/winsys/radeon/drm/Makefile
+		src/gallium/winsys/amdgpu/drm/Makefile
 		src/gallium/winsys/svga/drm/Makefile
 		src/gallium/winsys/sw/dri/Makefile
 		src/gallium/winsys/sw/kms-dri/Makefile
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 220bcc8742f..54c0c5aa6a8 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -92,43 +92,43 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
   GL_ARB_vertex_type_2_10_10_10_rev                     DONE ()
 
 
-GL 4.0, GLSL 4.00:
+GL 4.0, GLSL 4.00 --- all DONE: nvc0, radeonsi
 
-  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965, nvc0)
+  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_draw_indirect                                 DONE (i965, r600, llvmpipe, softpipe)
+  GL_ARB_gpu_shader5                                   DONE (i965)
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          DONE (r600, softpipe)
   - Dynamically uniform UBO array indices              DONE (r600)
   - Implicit signed -> unsigned conversions            DONE
   - Fused multiply-add                                 DONE ()
-  - Packing/bitfield/conversion functions              DONE (r600, radeonsi, softpipe)
-  - Enhanced textureGather                             DONE (r600, radeonsi, softpipe)
+  - Packing/bitfield/conversion functions              DONE (r600, softpipe)
+  - Enhanced textureGather                             DONE (r600, softpipe)
   - Geometry shader instancing                         DONE (r600, llvmpipe, softpipe)
   - Geometry shader multiple streams                   DONE ()
-  - Enhanced per-sample shading                        DONE (r600, radeonsi)
+  - Enhanced per-sample shading                        DONE (r600)
   - Interpolation functions                            DONE (r600)
   - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (nvc0, softpipe)
-  GL_ARB_sample_shading                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_shader_subroutine                             started (Dave)
-  GL_ARB_tessellation_shader                           started (Chris, Ilia)
-  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_gather                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_query_lod                             DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_transform_feedback2                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_transform_feedback3                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
+  GL_ARB_sample_shading                                DONE (i965, nv50, r600)
+  GL_ARB_shader_subroutine                             DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_tessellation_shader                           DONE ()
+  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, r600, llvmpipe, softpipe)
+  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_texture_gather                                DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_texture_query_lod                             DONE (i965, nv50, r600)
+  GL_ARB_transform_feedback2                           DONE (i965, nv50, r600, llvmpipe, softpipe)
+  GL_ARB_transform_feedback3                           DONE (i965, nv50, r600, llvmpipe, softpipe)
 
 
-GL 4.1, GLSL 4.10:
+GL 4.1, GLSL 4.10 --- all DONE: nvc0, radeonsi
 
-  GL_ARB_ES2_compatibility                             DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_ES2_compatibility                             DONE (i965, nv50, r600, llvmpipe, softpipe)
   GL_ARB_get_program_binary                            DONE (0 binary formats)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_shader_precision                              started (Micah)
-  GL_ARB_vertex_attrib_64bit                           DONE (nvc0, softpipe)
-  GL_ARB_viewport_array                                DONE (i965, nv50, nvc0, r600, llvmpipe)
+  GL_ARB_shader_precision                              DONE (all drivers that support GLSL 4.10)
+  GL_ARB_vertex_attrib_64bit                           DONE (llvmpipe, softpipe)
+  GL_ARB_viewport_array                                DONE (i965, nv50, r600, llvmpipe)
 
 
 GL 4.2, GLSL 4.20:
@@ -139,7 +139,7 @@ GL 4.2, GLSL 4.20:
   GL_ARB_texture_storage                               DONE (all drivers)
   GL_ARB_transform_feedback_instanced                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_base_instance                                 DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_shader_image_load_store                       in progress (curro)
+  GL_ARB_shader_image_load_store                       DONE (i965)
   GL_ARB_conservative_depth                            DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_420pack                      DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_packing                      DONE (all drivers)
@@ -156,7 +156,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_copy_image                                    DONE (i965) (gallium - in progress, VMware)
   GL_KHR_debug                                         DONE (all drivers)
   GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                       DONE (nv50, nvc0, r600, llvmpipe)
+  GL_ARB_fragment_layer_viewport                       DONE (nv50, nvc0, r600, radeonsi, llvmpipe)
   GL_ARB_framebuffer_no_attachments                    DONE (i965)
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
@@ -189,20 +189,11 @@ GL 4.5, GLSL 4.50:
 
   GL_ARB_ES3_1_compatibility                           not started
   GL_ARB_clip_control                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, llvmpipe, softpipe)
+  GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_cull_distance                                 in progress (Tobias)
-  GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600)
+  GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_ARB_direct_state_access                           DONE (all drivers)
-  - Transform Feedback object                          DONE
-  - Buffer object                                      DONE
-  - Framebuffer object                                 DONE
-  - Renderbuffer object                                DONE
-  - Texture object                                     DONE
-  - Vertex array object                                DONE
-  - Sampler object                                     DONE
-  - Program Pipeline object                            DONE
-  - Query object                                       DONE (will require changes when GL_ARB_query_buffer_object lands)
-  GL_ARB_get_texture_sub_image                         started (Brian Paul)
+  GL_ARB_get_texture_sub_image                         DONE (all drivers)
   GL_ARB_shader_texture_image_samples                  not started
   GL_ARB_texture_barrier                               DONE (nv50, nvc0, r600, radeonsi)
   GL_KHR_context_flush_control                         DONE (all - but needs GLX/EXT extension to be useful)
diff --git a/docs/egl.html b/docs/egl.html
index 3ab1a6018fd..bc21c6c4894 100644
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -88,10 +88,10 @@ types such as <code>EGLNativeDisplayType</code> or
 <code>EGLNativeWindowType</code> defined for.</p>
 
 <p>The available platforms are <code>x11</code>, <code>drm</code>,
-<code>wayland</code>, <code>null</code>, <code>android</code>,
-<code>haiku</code>, and <code>gdi</code>.  The <code>android</code> platform
+<code>wayland</code>, <code>surfaceless</code>, <code>android</code>,
+and <code>haiku</code>.  The <code>android</code> platform
 can only be built as a system component, part of AOSP, while the
-<code>haiku</code> and <code>gdi</code> platforms can only be built with SCons.
+<code>haiku</code> platform can only be built with SCons.
 Unless for special needs, the build system should
 select the right platforms automatically.</p>
 
diff --git a/docs/index.html b/docs/index.html
index 80c6e03e3f1..b9e6148914e 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,40 @@
 
 <h1>News</h1>
 
+<h2>August 11, 2015</h2>
+<p>
+<a href="relnotes/10.6.4.html">Mesa 10.6.4</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>July 26, 2015</h2>
+<p>
+<a href="relnotes/10.6.3.html">Mesa 10.6.3</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>July 11, 2015</h2>
+<p>
+<a href="relnotes/10.6.2.html">Mesa 10.6.2</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>July 04, 2015</h2>
+<p>
+<a href="relnotes/10.5.9.html">Mesa 10.5.9</a> is released.
+This is a bug-fix release.
+<br>
+NOTE: It is anticipated that 10.5.9 will be the final release in the 10.5
+series. Users of 10.5 are encouraged to migrate to the 10.6 series in order
+to obtain future fixes.
+</p>
+
+<h2>June 29, 2015</h2>
+<p>
+<a href="relnotes/10.6.1.html">Mesa 10.6.1</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>June 20, 2015</h2>
 <p>
 <a href="relnotes/10.5.8.html">Mesa 10.5.8</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 5fd80025a39..39e7f61e792 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,11 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.6.4.html">10.6.4 release notes</a>
+<li><a href="relnotes/10.6.3.html">10.6.3 release notes</a>
+<li><a href="relnotes/10.6.2.html">10.6.2 release notes</a>
+<li><a href="relnotes/10.5.9.html">10.5.9 release notes</a>
+<li><a href="relnotes/10.6.1.html">10.6.1 release notes</a>
 <li><a href="relnotes/10.5.8.html">10.5.8 release notes</a>
 <li><a href="relnotes/10.6.0.html">10.6.0 release notes</a>
 <li><a href="relnotes/10.5.7.html">10.5.7 release notes</a>
diff --git a/docs/relnotes/10.5.9.html b/docs/relnotes/10.5.9.html
new file mode 100644
index 00000000000..a1d11c3b70d
--- /dev/null
+++ b/docs/relnotes/10.5.9.html
@@ -0,0 +1,140 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.5.9 Release Notes / July 04, 2015</h1>
+
+<p>
+Mesa 10.5.9 is a bug fix release which fixes bugs found since the 10.5.8 release.
+</p>
+<p>
+Mesa 10.5.9 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+0c081b59572ee9732e7438d34adc3817fe8cc8d4b58abc0e71fd4b4c904945cb  mesa-10.5.9.tar.gz
+71c69f31d3dbc35cfa79950e58a01d27030378d8c7ef1259a0b31d4d0487f4ec  mesa-10.5.9.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84225">Bug 84225</a> - Allow constant-index-expression sampler array indexing with GLSL-ES &lt; 300</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88999">Bug 88999</a> - [SKL] Compiz crashes after opening unity dash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89118">Bug 89118</a> - [SKL Bisected]many Ogles3conform cases core dumped</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90537">Bug 90537</a> - radeonsi bo/va conflict on RADEON_GEM_VA (rscreen-&gt;ws-&gt;buffer_from_handle returns NULL)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90839">Bug 90839</a> - [10.5.5/10.6 regression, bisected] PBO glDrawPixels no longer using blit fastpath</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90873">Bug 90873</a> - Kernel hang, TearFree On, Mate desktop environment</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91056">Bug 91056</a> - The Bard's Tale (2005, native)  has rendering issues</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91117">Bug 91117</a> - Nimbus (running in wine) has rendering issues, objects are semi-transparent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91124">Bug 91124</a> - Civilization V (in Wine) has rendering issues: text missing, menu bar corrupted</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Ben Widawsky (2):</p>
+<ul>
+  <li>i965/gen9: Implement Push Constant Buffer workaround</li>
+  <li>i965/skl: Use 1 register for uniform pull constant payload</li>
+</ul>
+
+<p>Boyan Ding (1):</p>
+<ul>
+  <li>egl/x11: Remove duplicate call to dri2_x11_add_configs_for_visuals</li>
+</ul>
+
+<p>Chris Wilson (3):</p>
+<ul>
+  <li>i965: Fix HW blitter pitch limits</li>
+  <li>i915: Blit RGBX&lt;-&gt;RGBA drawpixels</li>
+  <li>i965: Export format comparison for blitting between miptrees</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: Add sha256sums for the 10.5.8 release</li>
+  <li>configure: warn about shared_glapi &amp; xlib-glx only when both are set</li>
+  <li>configure: error out when building backend-less libEGL</li>
+  <li>configure: error out when building libEGL without shared-glapi</li>
+  <li>gbm: do not (over)link against libglapi.so</li>
+  <li>Update version to 10.5.9</li>
+</ul>
+
+<p>Frank Henigman (1):</p>
+<ul>
+  <li>gbm: dlopen libglapi so gbm_create_device works</li>
+</ul>
+
+<p>Ilia Mirkin (8):</p>
+<ul>
+  <li>glsl: add version checks to conditionals for builtin variable enablement</li>
+  <li>mesa: add GL_PROGRAM_PIPELINE support in KHR_debug calls</li>
+  <li>glsl: binding point is a texture unit, which is a combined space</li>
+  <li>nvc0: always put all tfb bufs into bufctx</li>
+  <li>nv50,nvc0: make sure to pushbuf_refn before putting bo into pushbuf_data</li>
+  <li>nv50/ir: propagate modifier to right arg when const-folding mad</li>
+  <li>nv50/ir: fix emission of address reg in 3rd source</li>
+  <li>nv50/ir: copy joinAt when splitting both before and after</li>
+</ul>
+
+<p>Mario Kleiner (2):</p>
+<ul>
+  <li>nouveau: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+  <li>winsys/radeon: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>winsys/radeon: Unmap GPU VM address range when destroying BO</li>
+</ul>
+
+<p>Tapani Pälli (6):</p>
+<ul>
+  <li>glsl: Allow dynamic sampler array indexing with GLSL ES &lt; 3.00</li>
+  <li>mesa/glsl: new compiler option EmitNoIndirectSampler</li>
+  <li>i915: use EmitNoIndirectSampler</li>
+  <li>mesa/st: use EmitNoIndirectSampler if !ARB_gpu_shader5</li>
+  <li>i965: use EmitNoIndirectSampler for gen &lt; 7</li>
+  <li>glsl: validate sampler array indexing for 'constant-index-expression'</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.6.1.html b/docs/relnotes/10.6.1.html
new file mode 100644
index 00000000000..f197b0f3a42
--- /dev/null
+++ b/docs/relnotes/10.6.1.html
@@ -0,0 +1,104 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.1 Release Notes / June 29, 2015</h1>
+
+<p>
+Mesa 10.6.1 is a bug fix release which fixes bugs found since the 10.6.0 release.
+</p>
+<p>
+Mesa 10.6.1 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+b4cccd4d0eabcc2bca00c3175d3ad88fdda57ffdb883a7998525b873a21fe607  mesa-10.6.1.tar.gz
+6c80a2b647e57c85dc36e609d9aed17f878f0d8e0cf9ace86d14cf604101e1eb  mesa-10.6.1.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90347">Bug 90347</a> - [NVE0+] Failure to insert texbar under some circumstances (causing bad colors in Terasology)</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Anuj Phogat (4):</p>
+<ul>
+  <li>mesa: Handle integer formats in need_rgb_to_luminance_conversion()</li>
+  <li>mesa: Use helper function need_rgb_to_luminance_conversion()</li>
+  <li>mesa: Turn need_rgb_to_luminance_conversion() in to a global function</li>
+  <li>meta: Abort meta path if ReadPixels need rgb to luminance conversion</li>
+</ul>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/gen9: Implement Push Constant Buffer workaround</li>
+</ul>
+
+<p>Boyan Ding (2):</p>
+<ul>
+  <li>egl/x11: Set version of swrastLoader to 2</li>
+  <li>egl/x11: Remove duplicate call to dri2_x11_add_configs_for_visuals</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: Add sha256sums for the 10.6.0 release</li>
+  <li>configure: warn about shared_glapi &amp; xlib-glx only when both are set</li>
+  <li>configure: error out when building backend-less libEGL</li>
+  <li>configure: error out when building libEGL without shared-glapi</li>
+  <li>gbm: do not (over)link against libglapi.so</li>
+  <li>Update version to 10.6.1</li>
+</ul>
+
+<p>Frank Henigman (1):</p>
+<ul>
+  <li>gbm: dlopen libglapi so gbm_create_device works</li>
+</ul>
+
+<p>Ilia Mirkin (9):</p>
+<ul>
+  <li>nvc0/ir: fix collection of first uses for texture barrier insertion</li>
+  <li>nv50,nvc0: clamp uniform size to 64k</li>
+  <li>nvc0/ir: can't have a join on a load with an indirect source</li>
+  <li>glsl: handle conversions to double when comparing param matches</li>
+  <li>glsl: add version checks to conditionals for builtin variable enablement</li>
+  <li>mesa: add GL_PROGRAM_PIPELINE support in KHR_debug calls</li>
+  <li>glsl: binding point is a texture unit, which is a combined space</li>
+  <li>nvc0: always put all tfb bufs into bufctx</li>
+  <li>nv50,nvc0: make sure to pushbuf_refn before putting bo into pushbuf_data</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.6.2.html b/docs/relnotes/10.6.2.html
new file mode 100644
index 00000000000..d95417a8521
--- /dev/null
+++ b/docs/relnotes/10.6.2.html
@@ -0,0 +1,165 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.2 Release Notes / July 11, 2015</h1>
+
+<p>
+Mesa 10.6.2 is a bug fix release which fixes bugs found since the 10.6.1 release.
+</p>
+<p>
+Mesa 10.6.2 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+9c7ab9300dda6c912faaaff97995ec1820ba21d114d9cf555f145cbad90995f4  mesa-10.6.2.tar.gz
+05753d3db4212900927b9894221a1669a10f56786e86a7e818b6e18a0817dca9  mesa-10.6.2.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=73528">Bug 73528</a> - Deferred lighting in Second Life causes system hiccups and screen flickering</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=80500">Bug 80500</a> - Flickering shadows in unreleased title trace</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=82186">Bug 82186</a> - [r600g] BARTS GPU lockup with minecraft shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84225">Bug 84225</a> - Allow constant-index-expression sampler array indexing with GLSL-ES &lt; 300</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90537">Bug 90537</a> - radeonsi bo/va conflict on RADEON_GEM_VA (rscreen-&gt;ws-&gt;buffer_from_handle returns NULL)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90873">Bug 90873</a> - Kernel hang, TearFree On, Mate desktop environment</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91022">Bug 91022</a> - [g45 g965 bisected] assertions generated from textureGrad cube samplers fix</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91047">Bug 91047</a> - [SNB Bisected] Messed up Fog in Super Smash Bros. Melee in Dolphin</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91056">Bug 91056</a> - The Bard's Tale (2005, native)  has rendering issues</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91117">Bug 91117</a> - Nimbus (running in wine) has rendering issues, objects are semi-transparent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91124">Bug 91124</a> - Civilization V (in Wine) has rendering issues: text missing, menu bar corrupted</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91173">Bug 91173</a> - Oddworld: Stranger's Wrath HD: disfigured models in wrong colors</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91226">Bug 91226</a> - Crash in glLinkProgram (NEW)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91231">Bug 91231</a> - [NV92] Psychonauts (native) segfaults on start when DRI3 enabled</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Chris Wilson (1):</p>
+<ul>
+  <li>loader: Look for any version of currently linked libudev.so</li>
+</ul>
+
+<p>Emil Velikov (2):</p>
+<ul>
+  <li>docs: Add sha256 checksums for the 10.6.1 release</li>
+  <li>Update version to 10.6.2</li>
+</ul>
+
+<p>Ilia Mirkin (8):</p>
+<ul>
+  <li>nv50/ir: propagate modifier to right arg when const-folding mad</li>
+  <li>nv50/ir: fix emission of address reg in 3rd source</li>
+  <li>nv50/ir: copy joinAt when splitting both before and after</li>
+  <li>mesa: reset the source packing when creating temp transfer image</li>
+  <li>nv50/ir: don't emit src2 in immediate form</li>
+  <li>mesa/prog: relative offsets into constbufs are not constant</li>
+  <li>nv50/ir: UCMP arguments are float, so make sure modifiers are applied</li>
+  <li>nvc0: turn sample counts off during blit</li>
+</ul>
+
+<p>Kenneth Graunke (5):</p>
+<ul>
+  <li>i965/fs: Fix ir_txs in emit_texture_gen4_simd16().</li>
+  <li>i965: Reserve more batch space to accomodate Gen6 perfmonitors.</li>
+  <li>i965/vs: Fix matNxM vertex attributes where M != 4.</li>
+  <li>Revert "glsl: clone inputs and outputs during linking"</li>
+  <li>Revert "i965: Delete linked GLSL IR when using NIR."</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>r600g: disable single-sample fast color clear due to hangs</li>
+  <li>radeonsi: fix a hang with DrawTransformFeedback on 4 SE chips</li>
+  <li>st/dri: don't set PIPE_BIND_SCANOUT for MSAA surfaces</li>
+</ul>
+
+<p>Mario Kleiner (2):</p>
+<ul>
+  <li>nouveau: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+  <li>winsys/radeon: Use dup fd as key in drm-winsys hash table to fix ZaphodHeads.</li>
+</ul>
+
+<p>Matt Turner (2):</p>
+<ul>
+  <li>i965/fs: Don't mess up stride for uniform integer multiplication.</li>
+  <li>Revert SHA1 additions.</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>winsys/radeon: Unmap GPU VM address range when destroying BO</li>
+</ul>
+
+<p>Mike Stroyan (2):</p>
+<ul>
+  <li>meta: Only change and restore viewport 0 in mesa meta mode</li>
+  <li>i965: allocate at least 1 BLEND_STATE element</li>
+</ul>
+
+<p>Neil Roberts (4):</p>
+<ul>
+  <li>i965/skl: Set the pulls bary bit in 3DSTATE_PS_EXTRA</li>
+  <li>glsl: Add missing check for whether an expression is an add operation</li>
+  <li>glsl: Make sure not to dereference NULL</li>
+  <li>i965: Don't try to print the GLSL IR if it has been freed</li>
+</ul>
+
+<p>Tapani Pälli (8):</p>
+<ul>
+  <li>glsl: clone inputs and outputs during linking</li>
+  <li>i965: Delete linked GLSL IR when using NIR.</li>
+  <li>glsl: Allow dynamic sampler array indexing with GLSL ES &lt; 3.00</li>
+  <li>mesa/glsl: new compiler option EmitNoIndirectSampler</li>
+  <li>i965: use EmitNoIndirectSampler for gen &lt; 7</li>
+  <li>i915: use EmitNoIndirectSampler</li>
+  <li>mesa/st: use EmitNoIndirectSampler if !ARB_gpu_shader5</li>
+  <li>glsl: validate sampler array indexing for 'constant-index-expression'</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.6.3.html b/docs/relnotes/10.6.3.html
new file mode 100644
index 00000000000..1622c87cde2
--- /dev/null
+++ b/docs/relnotes/10.6.3.html
@@ -0,0 +1,106 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.3 Release Notes / July 26, 2015</h1>
+
+<p>
+Mesa 10.6.3 is a bug fix release which fixes bugs found since the 10.6.2 release.
+</p>
+<p>
+Mesa 10.6.3 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+c27e1e33798e69a6d2d2425aee8ac7b4c0b243066a65dd76cbb182ea31b1c7f2  mesa-10.6.3.tar.gz
+58592e07c350cd2e8969b73fa83048c657a39fe2f13f3b88f5e5818fe2e4676d  mesa-10.6.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90728">Bug 90728</a> - dvd playback with vlc and vdpau causes segmentation fault</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91337">Bug 91337</a> - OSMesaGetProcAdress(&quot;OSMesaPixelStore&quot;) returns nil</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>osmesa: fix OSMesaPixelsStore typo</li>
+</ul>
+
+<p>Chad Versace (1):</p>
+<ul>
+  <li>mesa: Fix generation of git_sha1.h.tmp for gitlinks</li>
+</ul>
+
+<p>Christian König (2):</p>
+<ul>
+  <li>vl: cleanup video buffer private when the decoder is destroyed</li>
+  <li>st/vdpau: fix mixer size checks</li>
+</ul>
+
+<p>Emil Velikov (3):</p>
+<ul>
+  <li>docs: Add sha256 checksums for the 10.6.2 release</li>
+  <li>auxiliary/vl: use the correct screen index</li>
+  <li>Update version to 10.6.3</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965/gen9: Use custom MOCS entries set up by the kernel.</li>
+</ul>
+
+<p>Ilia Mirkin (5):</p>
+<ul>
+  <li>nv50, nvc0: enable at least one color RT if alphatest is enabled</li>
+  <li>nvc0/ir: fix txq on indirect samplers</li>
+  <li>nvc0/ir: don't worry about sampler in txq handling</li>
+  <li>gm107/ir: fix indirect txq emission</li>
+  <li>nv50: fix max level clamping on G80</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>program: Allow redundant OPTION ARB_fog_* directives.</li>
+</ul>
+
+<p>Rob Clark (1):</p>
+<ul>
+  <li>xa: don't leak fences</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.6.4.html b/docs/relnotes/10.6.4.html
new file mode 100644
index 00000000000..168182ec52e
--- /dev/null
+++ b/docs/relnotes/10.6.4.html
@@ -0,0 +1,137 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.4 Release Notes / August 11, 2015</h1>
+
+<p>
+Mesa 10.6.4 is a bug fix release which fixes bugs found since the 10.6.3 release.
+</p>
+<p>
+Mesa 10.6.4 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+4960bf17d8b5d6a6503c6954ec6cf480b5cd930797bac901c60bea192675f85e  mesa-10.6.4.tar.gz
+8f5ac103f0f503de2f7a985b0df349bd4ecdfe7f51c714be146fa5a9a3c07b77  mesa-10.6.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=73512">Bug 73512</a> - [clover] mesa.icd. should contain full path</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91290">Bug 91290</a> - SIGSEGV glcpp/glcpp-parse.y:1077</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Anuj Phogat (6):</p>
+<ul>
+  <li>mesa: Turn get_readpixels_transfer_ops() in to a global function</li>
+  <li>meta: Fix transfer operations check in meta pbo path for readpixels</li>
+  <li>meta: Abort meta pbo path if readpixels need signed-unsigned conversion</li>
+  <li>meta: Don't do fragment color clamping in _mesa_meta_pbo_GetTexSubImage</li>
+  <li>mesa: Add a helper function _mesa_need_luminance_to_rgb_conversion()</li>
+  <li>meta: Fix reading luminance texture as rgba in _mesa_meta_pbo_GetTexSubImage()</li>
+</ul>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/skl: Add production thread counts and URB size</li>
+</ul>
+
+<p>Eduardo Lima Mitev (3):</p>
+<ul>
+  <li>mesa: Fix errors values returned by glShaderBinary()</li>
+  <li>mesa: Validate target before resolving tex obj in glTex(ture)SubImageXD</li>
+  <li>mesa: Fix error returned by glCopyTexImage2D() upon an invalid internal format</li>
+</ul>
+
+<p>Emil Velikov (6):</p>
+<ul>
+  <li>docs: Add checksums for mesa 10.6.3 tarballs</li>
+  <li>configure.ac: do not set HAVE_DRI(23) when libdrm is missing</li>
+  <li>egl/wayland: libdrm is a hard requirement, treat it as such</li>
+  <li>winsys/radeon: don't leak the fd when it is 0</li>
+  <li>bugzilla_mesa.sh: sort the bugs list by number</li>
+  <li>Update version to 10.6.4</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965/fs: Fix fs_inst::regs_read() for sources in the ATTR file.</li>
+</ul>
+
+<p>Frank Binns (2):</p>
+<ul>
+  <li>egl/dri: Add error info needed for EGL_EXT_image_dma_buf_import extension</li>
+  <li>egl: Add eglQuerySurface surface type check for EGL_LARGEST_PBUFFER attrib</li>
+</ul>
+
+<p>Igor Gnatenko (1):</p>
+<ul>
+  <li>opencl: use versioned .so in mesa.icd</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nvc0: fix geometry program revalidation of clipping params</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>glsl: Fix a bug where LHS swizzles of swizzles were too small.</li>
+</ul>
+
+<p>Marek Olšák (6):</p>
+<ul>
+  <li>st/mesa: don't call st_validate_state in BlitFramebuffer</li>
+  <li>radeonsi: upload shader rodata after updating scratch relocations</li>
+  <li>st/mesa: don't ignore texture buffer state changes</li>
+  <li>radeonsi: rework how shader pointers to descriptors are set</li>
+  <li>radeonsi: completely rework updating descriptors without CP DMA</li>
+  <li>r600g: fix the CB_SHADER_MASK setup</li>
+</ul>
+
+<p>Samuel Iglesias Gonsalvez (1):</p>
+<ul>
+  <li>glsl/glcpp: fix SIGSEGV when checking error condition for macro redefinition</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>nv50: avoid segfault with enabled but unbound vertex attrib</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
deleted file mode 100644
index e089889667d..00000000000
--- a/docs/relnotes/10.7.0.html
+++ /dev/null
@@ -1,61 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html lang="en">
-<head>
-  <meta http-equiv="content-type" content="text/html; charset=utf-8">
-  <title>Mesa Release Notes</title>
-  <link rel="stylesheet" type="text/css" href="../mesa.css">
-</head>
-<body>
-
-<div class="header">
-  <h1>The Mesa 3D Graphics Library</h1>
-</div>
-
-<iframe src="../contents.html"></iframe>
-<div class="content">
-
-<h1>Mesa 10.7.0 Release Notes / TBD</h1>
-
-<p>
-Mesa 10.7.0 is a new development release.
-People who are concerned with stability and reliability should stick
-with a previous release or wait for Mesa 10.7.1.
-</p>
-<p>
-Mesa 10.7.0 implements the OpenGL 3.3 API, but the version reported by
-glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
-glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
-Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
-3.3 is <strong>only</strong> available if requested at context creation
-because compatibility contexts are not supported.
-</p>
-
-
-<h2>SHA256 checksums</h2>
-<pre>
-TBD.
-</pre>
-
-
-<h2>New features</h2>
-
-<p>
-Note: some of the new features are only available with certain drivers.
-</p>
-
-<ul>
-<li>GL_ARB_framebuffer_no_attachments on i965</li>
-<li>GL_ARB_shader_stencil_export on llvmpipe</li>
-</ul>
-
-<h2>Bug fixes</h2>
-
-TBD.
-
-<h2>Changes</h2>
-
-TBD.
-
-</div>
-</body>
-</html>
diff --git a/docs/relnotes/11.0.0.html b/docs/relnotes/11.0.0.html
new file mode 100644
index 00000000000..75967ac7eec
--- /dev/null
+++ b/docs/relnotes/11.0.0.html
@@ -0,0 +1,89 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.0 Release Notes / TBD</h1>
+
+<p>
+Mesa 11.0.0 is a new development release.
+People who are concerned with stability and reliability should stick
+with a previous release or wait for Mesa 11.0.1.
+</p>
+<p>
+Mesa 11.0.0 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD.
+</pre>
+
+
+<h2>New features</h2>
+
+<p>
+Note: some of the new features are only available with certain drivers.
+</p>
+
+<ul>
+<li>New hardware support for AMD GCN 1.2 GPUs: Tonga, Iceland, Carrizo, Fiji</li>
+<li>OpenGL 4.1 on radeonsi, nvc0</li>
+<li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
+<li>GL_ARB_conditional_render_inverted on r600, radeonsi</li>
+<li>GL_ARB_derivative_control on radeonsi</li>
+<li>GL_ARB_fragment_layer_viewport on radeonsi</li>
+<li>GL_ARB_framebuffer_no_attachments on i965</li>
+<li>GL_ARB_get_texture_sub_image for all drivers</li>
+<li>GL_ARB_gpu_shader5 on radeonsi</li>
+<li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
+<li>GL_ARB_shader_image_load_store on i965</li>
+<li>GL_ARB_shader_precision on radeonsi, nvc0</li>
+<li>GL_ARB_shader_stencil_export on llvmpipe</li>
+<li>GL_ARB_shader_subroutine on core profile for all drivers</li>
+<li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>
+<li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
+<li>GL_ARB_viewport_array on radeonsi</li>
+<li>GL_EXT_depth_bounds_test on radeonsi, nv30, nv50, nvc0</li>
+<li>GL_NV_read_depth (GLES) on all drivers</li>
+<li>GL_NV_read_depth_stencil (GLES) on all drivers</li>
+<li>GL_NV_read_stencil (GLES) on all drivers</li>
+<li>GL_OES_texture_float on r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GL_OES_texture_half_float on r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GL_OES_texture_float_linear on r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GL_OES_texture_half_float_linear on r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GLX_ARB_create_context_robustness on r600, radeonsi</li>
+<li>EGL_EXT_create_context_robustness on r600, radeonsi</li>
+<li>EGL_KHR_gl_colorspace on r600, radeonsi, nv50, nvc0</li>
+<li>EGL_KHR_gl_texture_3D_image on r600, radeonsi, nv50, nvc0</li>
+<li>EGL 1.5 on r600, radeonsi, nv50, nvc0</li>
+</ul>
+
+<h2>Bug fixes</h2>
+
+TBD.
+
+<h2>Changes</h2>
+
+TBD.
+
+</div>
+</body>
+</html>
diff --git a/doxygen/.gitignore b/doxygen/.gitignore
index abf56ac682d..a5f3921b445 100644
--- a/doxygen/.gitignore
+++ b/doxygen/.gitignore
@@ -1,3 +1,4 @@
+*.db
 *.tag
 *.tmp
 agpgart
diff --git a/doxygen/Makefile b/doxygen/Makefile
index 0a95a3516a2..01c2691cfe0 100644
--- a/doxygen/Makefile
+++ b/doxygen/Makefile
@@ -33,3 +33,4 @@ subset: $(SUBSET:.doxy=.tag)
 clean:
 	-rm -rf $(FULL:.doxy=) $(SUBSET:.doxy=)
 	-rm -rf *.tag
+	-rm -rf *.db
diff --git a/include/EGL/eglplatform.h b/include/EGL/eglplatform.h
index 7802542ad0f..b376e642822 100644
--- a/include/EGL/eglplatform.h
+++ b/include/EGL/eglplatform.h
@@ -77,7 +77,7 @@ typedef HDC     EGLNativeDisplayType;
 typedef HBITMAP EGLNativePixmapType;
 typedef HWND    EGLNativeWindowType;
 
-#elif defined(__APPLE__) || defined(__WINSCW__) || defined(__SYMBIAN32__)  /* Symbian */
+#elif defined(__WINSCW__) || defined(__SYMBIAN32__)  /* Symbian */
 
 typedef int   EGLNativeDisplayType;
 typedef void *EGLNativeWindowType;
@@ -105,7 +105,7 @@ typedef struct ANativeWindow*           EGLNativeWindowType;
 typedef struct egl_native_pixmap_t*     EGLNativePixmapType;
 typedef void*                           EGLNativeDisplayType;
 
-#elif defined(__unix__)
+#elif defined(__unix__) || defined(__APPLE__)
 
 #if defined(MESA_EGL_NO_X11_HEADERS)
 
diff --git a/include/GL/glext.h b/include/GL/glext.h
index a3873a613f9..e5f1d891ec5 100644
--- a/include/GL/glext.h
+++ b/include/GL/glext.h
@@ -3879,7 +3879,12 @@ GLAPI void APIENTRY glMinSampleShadingARB (GLfloat value);
 #ifndef GL_ARB_shader_objects
 #define GL_ARB_shader_objects 1
 #ifdef __APPLE__
+#ifdef BUILDING_MESA
+/* Avoid uint <-> void* warnings */
+typedef unsigned long GLhandleARB;
+#else
 typedef void *GLhandleARB;
+#endif
 #else
 typedef unsigned int GLhandleARB;
 #endif
diff --git a/include/GL/glx.h b/include/GL/glx.h
index 78f5052b23a..1e4bb7d7176 100644
--- a/include/GL/glx.h
+++ b/include/GL/glx.h
@@ -368,18 +368,6 @@ extern Bool glXDrawableAttribARB(Display *dpy, GLXDrawable draw, const int *attr
 #endif /* GLX_ARB_render_texture */
 
 
-/*
- * Remove this when glxext.h is updated.
- */
-#ifndef GLX_NV_float_buffer
-#define GLX_NV_float_buffer 1
-
-#define GLX_FLOAT_COMPONENTS_NV         0x20B0
-
-#endif /* GLX_NV_float_buffer */
-
-
-
 /*
  * #?. GLX_MESA_swap_frame_usage
  */
@@ -415,86 +403,6 @@ typedef int (*PFNGLXGETSWAPINTERVALMESAPROC)(void);
 #endif /* GLX_MESA_swap_control */
 
 
-
-/*
- * #?. GLX_EXT_texture_from_pixmap
- * XXX not finished?
- */
-#ifndef GLX_EXT_texture_from_pixmap
-#define GLX_EXT_texture_from_pixmap 1
-
-#define GLX_BIND_TO_TEXTURE_RGB_EXT        0x20D0
-#define GLX_BIND_TO_TEXTURE_RGBA_EXT       0x20D1
-#define GLX_BIND_TO_MIPMAP_TEXTURE_EXT     0x20D2
-#define GLX_BIND_TO_TEXTURE_TARGETS_EXT    0x20D3
-#define GLX_Y_INVERTED_EXT                 0x20D4
-
-#define GLX_TEXTURE_FORMAT_EXT             0x20D5
-#define GLX_TEXTURE_TARGET_EXT             0x20D6
-#define GLX_MIPMAP_TEXTURE_EXT             0x20D7
-
-#define GLX_TEXTURE_FORMAT_NONE_EXT        0x20D8
-#define GLX_TEXTURE_FORMAT_RGB_EXT         0x20D9
-#define GLX_TEXTURE_FORMAT_RGBA_EXT        0x20DA
-
-#define GLX_TEXTURE_1D_BIT_EXT             0x00000001
-#define GLX_TEXTURE_2D_BIT_EXT             0x00000002
-#define GLX_TEXTURE_RECTANGLE_BIT_EXT      0x00000004
-
-#define GLX_TEXTURE_1D_EXT                 0x20DB
-#define GLX_TEXTURE_2D_EXT                 0x20DC
-#define GLX_TEXTURE_RECTANGLE_EXT          0x20DD
-
-#define GLX_FRONT_LEFT_EXT                 0x20DE
-#define GLX_FRONT_RIGHT_EXT                0x20DF
-#define GLX_BACK_LEFT_EXT                  0x20E0
-#define GLX_BACK_RIGHT_EXT                 0x20E1
-#define GLX_FRONT_EXT                      GLX_FRONT_LEFT_EXT
-#define GLX_BACK_EXT                       GLX_BACK_LEFT_EXT
-#define GLX_AUX0_EXT                       0x20E2
-#define GLX_AUX1_EXT                       0x20E3 
-#define GLX_AUX2_EXT                       0x20E4 
-#define GLX_AUX3_EXT                       0x20E5 
-#define GLX_AUX4_EXT                       0x20E6 
-#define GLX_AUX5_EXT                       0x20E7 
-#define GLX_AUX6_EXT                       0x20E8
-#define GLX_AUX7_EXT                       0x20E9 
-#define GLX_AUX8_EXT                       0x20EA 
-#define GLX_AUX9_EXT                       0x20EB
-
-extern void glXBindTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer, const int *attrib_list);
-extern void glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer);
-
-#endif /* GLX_EXT_texture_from_pixmap */
-
-
-#ifndef GLX_MESA_query_renderer
-#define GLX_MESA_query_renderer 1
-
-#define GLX_RENDERER_VENDOR_ID_MESA                      0x8183
-#define GLX_RENDERER_DEVICE_ID_MESA                      0x8184
-#define GLX_RENDERER_VERSION_MESA                        0x8185
-#define GLX_RENDERER_ACCELERATED_MESA                    0x8186
-#define GLX_RENDERER_VIDEO_MEMORY_MESA                   0x8187
-#define GLX_RENDERER_UNIFIED_MEMORY_ARCHITECTURE_MESA    0x8188
-#define GLX_RENDERER_PREFERRED_PROFILE_MESA              0x8189
-#define GLX_RENDERER_OPENGL_CORE_PROFILE_VERSION_MESA    0x818A
-#define GLX_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION_MESA    0x818B
-#define GLX_RENDERER_OPENGL_ES_PROFILE_VERSION_MESA      0x818C
-#define GLX_RENDERER_OPENGL_ES2_PROFILE_VERSION_MESA     0x818D
-#define GLX_RENDERER_ID_MESA                             0x818E
-
-Bool glXQueryRendererIntegerMESA(Display *dpy, int screen, int renderer, int attribute, unsigned int *value);
-Bool glXQueryCurrentRendererIntegerMESA(int attribute, unsigned int *value);
-const char *glXQueryRendererStringMESA(Display *dpy, int screen, int renderer, int attribute);
-const char *glXQueryCurrentRendererStringMESA(int attribute);
-
-typedef Bool (*PFNGLXQUERYRENDERERINTEGERMESAPROC) (Display *dpy, int screen, int renderer, int attribute, unsigned int *value);
-typedef Bool (*PFNGLXQUERYCURRENTRENDERERINTEGERMESAPROC) (int attribute, unsigned int *value);
-typedef const char *(*PFNGLXQUERYRENDERERSTRINGMESAPROC) (Display *dpy, int screen, int renderer, int attribute);
-typedef const char *(*PFNGLXQUERYCURRENTRENDERERSTRINGMESAPROC) (int attribute);
-#endif /* GLX_MESA_query_renderer */
-
 /*** Should these go here, or in another header? */
 /*
 ** GLX Events
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index c827bb640f3..a0f155a1f42 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -40,14 +40,7 @@
 #ifndef DRI_INTERFACE_H
 #define DRI_INTERFACE_H
 
-/* For archs with no drm.h */
-#if defined(__APPLE__) || defined(__CYGWIN__) || defined(__GNU__)
-#ifndef __NOT_HAVE_DRM_H
-#define __NOT_HAVE_DRM_H
-#endif
-#endif
-
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
 #include <drm.h>
 #else
 typedef unsigned int drm_context_t;
@@ -1101,12 +1094,15 @@ struct __DRIdri2ExtensionRec {
 
 
 /**
- * Four CC formats that matches with WL_DRM_FORMAT_* from wayland_drm.h
- * and GBM_FORMAT_* from gbm.h, used with createImageFromNames.
+ * FourCC formats that match WL_DRM_FORMAT_* from wayland_drm.h,
+ * GBM_FORMAT_* from gbm.h, and DRM_FORMAT_* from drm_fourcc.h. Used with
+ * createImageFromNames.
  *
  * \since 5
  */
 
+#define __DRI_IMAGE_FOURCC_R8		0x20203852
+#define __DRI_IMAGE_FOURCC_GR88		0x38385247
 #define __DRI_IMAGE_FOURCC_RGB565	0x36314752
 #define __DRI_IMAGE_FOURCC_ARGB8888	0x34325241
 #define __DRI_IMAGE_FOURCC_XRGB8888	0x34325258
@@ -1141,6 +1137,8 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_COMPONENTS_Y_U_V	0x3003
 #define __DRI_IMAGE_COMPONENTS_Y_UV	0x3004
 #define __DRI_IMAGE_COMPONENTS_Y_XUXV	0x3005
+#define __DRI_IMAGE_COMPONENTS_R	0x3006
+#define __DRI_IMAGE_COMPONENTS_RG	0x3007
 
 
 /**
@@ -1180,7 +1178,8 @@ enum __DRIChromaSiting {
 };
 
 /**
- * \name Reasons that __DRIimageExtensionRec::createImageFromTexture might fail
+ * \name Reasons that __DRIimageExtensionRec::createImageFromTexture or
+ * __DRIimageExtensionRec::createImageFromDmaBufs might fail
  */
 /*@{*/
 /** Success! */
@@ -1189,11 +1188,14 @@ enum __DRIChromaSiting {
 /** Memory allocation failure */
 #define __DRI_IMAGE_ERROR_BAD_ALLOC     1
 
-/** Client requested an invalid attribute for a texture object  */
+/** Client requested an invalid attribute */
 #define __DRI_IMAGE_ERROR_BAD_MATCH     2
 
 /** Client requested an invalid texture object */
 #define __DRI_IMAGE_ERROR_BAD_PARAMETER 3
+
+/** Client requested an invalid pitch and/or offset */
+#define __DRI_IMAGE_ERROR_BAD_ACCESS    4
 /*@}*/
 
 /**
@@ -1444,6 +1446,11 @@ typedef struct __DRIDriverVtableExtensionRec {
 #define __DRI2_RENDERER_OPENGL_COMPATIBILITY_PROFILE_VERSION  0x0008
 #define __DRI2_RENDERER_OPENGL_ES_PROFILE_VERSION             0x0009
 #define __DRI2_RENDERER_OPENGL_ES2_PROFILE_VERSION            0x000a
+#define __DRI2_RENDERER_HAS_TEXTURE_3D                        0x000b
+/* Whether an sRGB format is supported for every supported 32-bit UNORM
+ * color format.
+ */
+#define __DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB                  0x000c
 
 typedef struct __DRI2rendererQueryExtensionRec __DRI2rendererQueryExtension;
 struct __DRI2rendererQueryExtensionRec {
diff --git a/include/c99_math.h b/include/c99_math.h
index 7ed7cc22176..8a67fb133d6 100644
--- a/include/c99_math.h
+++ b/include/c99_math.h
@@ -140,6 +140,18 @@ llrintf(float f)
    return rounded;
 }
 
+static inline float
+exp2f(float f)
+{
+   return powf(2.0f, f);
+}
+
+static inline double
+exp2(double d)
+{
+   return pow(2.0, d);
+}
+
 #endif /* C99 */
 
 
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index 8d757aaa767..8a425999429 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -128,3 +128,6 @@ CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B3, chv,     "Intel(R) HD Graphics (Cherryview)")
+CHIPSET(0x0A84, bxt,     "Intel(R) HD Graphics (Broxton)")
+CHIPSET(0x1A84, bxt,     "Intel(R) HD Graphics (Broxton)")
+CHIPSET(0x5A84, bxt,     "Intel(R) HD Graphics (Broxton)")
diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
index cd5da99a6a6..52eada1d3d5 100644
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -63,6 +63,7 @@ CHIPSET(0x6608, OLAND_6608, OLAND)
 CHIPSET(0x6610, OLAND_6610, OLAND)
 CHIPSET(0x6611, OLAND_6611, OLAND)
 CHIPSET(0x6613, OLAND_6613, OLAND)
+CHIPSET(0x6617, OLAND_6617, OLAND)
 CHIPSET(0x6620, OLAND_6620, OLAND)
 CHIPSET(0x6621, OLAND_6621, OLAND)
 CHIPSET(0x6623, OLAND_6623, OLAND)
@@ -156,3 +157,27 @@ CHIPSET(0x67B8, HAWAII_67B8, HAWAII)
 CHIPSET(0x67B9, HAWAII_67B9, HAWAII)
 CHIPSET(0x67BA, HAWAII_67BA, HAWAII)
 CHIPSET(0x67BE, HAWAII_67BE, HAWAII)
+
+CHIPSET(0x6900, ICELAND_, ICELAND)
+CHIPSET(0x6901, ICELAND_, ICELAND)
+CHIPSET(0x6902, ICELAND_, ICELAND)
+CHIPSET(0x6903, ICELAND_, ICELAND)
+CHIPSET(0x6907, ICELAND_, ICELAND)
+
+CHIPSET(0x6920, TONGA_, TONGA)
+CHIPSET(0x6921, TONGA_, TONGA)
+CHIPSET(0x6928, TONGA_, TONGA)
+CHIPSET(0x6929, TONGA_, TONGA)
+CHIPSET(0x692B, TONGA_, TONGA)
+CHIPSET(0x692F, TONGA_, TONGA)
+CHIPSET(0x6930, TONGA_, TONGA)
+CHIPSET(0x6938, TONGA_, TONGA)
+CHIPSET(0x6939, TONGA_, TONGA)
+
+CHIPSET(0x9870, CARRIZO_, CARRIZO)
+CHIPSET(0x9874, CARRIZO_, CARRIZO)
+CHIPSET(0x9875, CARRIZO_, CARRIZO)
+CHIPSET(0x9876, CARRIZO_, CARRIZO)
+CHIPSET(0x9877, CARRIZO_, CARRIZO)
+
+CHIPSET(0x7300, FIJI_, FIJI)
diff --git a/src/Makefile.am b/src/Makefile.am
index d41a087ae1c..da638a811fb 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 SUBDIRS = . gtest util mapi/glapi/gen mapi
 
 if NEED_OPENGL_COMMON
@@ -37,16 +35,12 @@ if HAVE_EGL_PLATFORM_WAYLAND
 SUBDIRS += egl/wayland/wayland-egl egl/wayland/wayland-drm
 endif
 
-if HAVE_EGL_DRIVER_DRI2
-SUBDIRS += egl/drivers/dri2
-endif
-
 if HAVE_GBM
 SUBDIRS += gbm
 endif
 
 if HAVE_EGL
-SUBDIRS += egl/main
+SUBDIRS += egl
 endif
 
 if HAVE_GALLIUM
@@ -54,8 +48,6 @@ SUBDIRS += gallium
 endif
 
 EXTRA_DIST = \
-	egl/drivers/haiku \
-	egl/docs \
 	getopt hgl SConscript
 
 AM_CFLAGS = $(VISIBILITY_CFLAGS)
diff --git a/src/SConscript b/src/SConscript
index b0578e89258..106b87d4251 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -31,13 +31,8 @@ SConscript('mesa/SConscript')
 if not env['embedded']:
     if env['platform'] not in ('cygwin', 'darwin', 'freebsd', 'haiku', 'windows'):
         SConscript('glx/SConscript')
-    if env['platform'] not in ['darwin', 'haiku', 'sunos', 'windows']:
-        if env['dri']:
-            SConscript('egl/drivers/dri2/SConscript')
-        SConscript('egl/main/SConscript')
     if env['platform'] == 'haiku':
-        SConscript('egl/drivers/haiku/SConscript')
-        SConscript('egl/main/SConscript')
+        SConscript('egl/SConscript')
 
     if env['gles']:
         SConscript('mapi/shared-glapi/SConscript')
diff --git a/src/egl/main/Android.mk b/src/egl/Android.mk
similarity index 79%
rename from src/egl/main/Android.mk
rename to src/egl/Android.mk
index 0ba72953960..ebd67af34cc 100644
--- a/src/egl/main/Android.mk
+++ b/src/egl/Android.mk
@@ -27,21 +27,36 @@ LOCAL_PATH := $(call my-dir)
 
 include $(LOCAL_PATH)/Makefile.sources
 
-SOURCES := \
-	${LIBEGL_C_FILES}
-
 # ---------------------------------------
 # Build libGLES_mesa
 # ---------------------------------------
 
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES := $(SOURCES)
+LOCAL_SRC_FILES := \
+	$(LIBEGL_C_FILES) \
+	$(dri2_backend_core_FILES) \
+	drivers/dri2/platform_android.c
 
 LOCAL_CFLAGS := \
 	-D_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_ANDROID \
-	-D_EGL_DRIVER_SEARCH_DIR=\"/system/lib/egl\" \
-	-D_EGL_OS_UNIX=1
+	-D_EGL_BUILT_IN_DRIVER_DRI2 \
+	-DHAVE_ANDROID_PLATFORM
+
+ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_CFLAGS_arm := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
+LOCAL_CFLAGS_x86 := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
+LOCAL_CFLAGS_x86_64 := -DDEFAULT_DRIVER_DIR=\"/system/lib64/dri\"
+else
+LOCAL_CFLAGS += -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
+endif
+
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/egl/main \
+	$(MESA_TOP)/src/egl/drivers/dri2 \
+
+LOCAL_STATIC_LIBRARIES := \
+	libmesa_loader
 
 LOCAL_SHARED_LIBRARIES := \
 	libdl \
@@ -55,12 +70,11 @@ LOCAL_SHARED_LIBRARIES += libsync
 endif
 
 # add libdrm if there are hardware drivers
-ifneq ($(MESA_GPU_DRIVERS),swrast)
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES += libdrm
 endif
 
-LOCAL_CFLAGS += -D_EGL_BUILT_IN_DRIVER_DRI2
-
 ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
 # require i915_dri and/or i965_dri
 LOCAL_REQUIRED_MODULES += \
@@ -71,9 +85,6 @@ ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
 LOCAL_REQUIRED_MODULES += gallium_dri
 endif # MESA_BUILD_GALLIUM
 
-LOCAL_STATIC_LIBRARIES := \
-	libmesa_egl_dri2 \
-	libmesa_loader
 
 LOCAL_MODULE := libGLES_mesa
 ifeq ($(MESA_LOLLIPOP_BUILD),true)
diff --git a/src/egl/main/Makefile.am b/src/egl/Makefile.am
similarity index 69%
rename from src/egl/main/Makefile.am
rename to src/egl/Makefile.am
index 9030d272b53..5c2ba301ffb 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/Makefile.am
@@ -23,18 +23,19 @@ include Makefile.sources
 
 AM_CFLAGS = \
 	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/egl/main \
 	-I$(top_srcdir)/src/gbm/main \
+	-I$(top_srcdir)/src \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
+	$(LIBDRM_CFLAGS) \
 	$(EGL_CFLAGS) \
-	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM) \
-	-D_EGL_DRIVER_SEARCH_DIR=\"$(libdir)/egl\" \
-	-D_EGL_OS_UNIX=1
+	-D_EGL_NATIVE_PLATFORM=$(EGL_NATIVE_PLATFORM)
 
 lib_LTLIBRARIES = libEGL.la
 
 libEGL_la_SOURCES = \
-	${LIBEGL_C_FILES}
+	$(LIBEGL_C_FILES)
 
 libEGL_la_LIBADD = \
 	$(EGL_LIB_DEPS)
@@ -45,10 +46,13 @@ libEGL_la_LDFLAGS = \
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
+dri2_backend_FILES =
+
 if HAVE_EGL_PLATFORM_X11
 AM_CFLAGS += -DHAVE_X11_PLATFORM
 AM_CFLAGS += $(XCB_DRI2_CFLAGS)
 libEGL_la_LIBADD += $(XCB_DRI2_LIBS)
+dri2_backend_FILES += drivers/dri2/platform_x11.c
 endif
 
 if HAVE_EGL_PLATFORM_WAYLAND
@@ -56,26 +60,37 @@ AM_CFLAGS += -DHAVE_WAYLAND_PLATFORM
 AM_CFLAGS += $(WAYLAND_CFLAGS)
 libEGL_la_LIBADD += $(WAYLAND_LIBS)
 libEGL_la_LIBADD += $(LIBDRM_LIBS)
-libEGL_la_LIBADD += ../wayland/wayland-drm/libwayland-drm.la
+libEGL_la_LIBADD += $(top_builddir)/src/egl/wayland/wayland-drm/libwayland-drm.la
+dri2_backend_FILES += drivers/dri2/platform_wayland.c
 endif
 
 if HAVE_EGL_PLATFORM_DRM
 AM_CFLAGS += -DHAVE_DRM_PLATFORM
-libEGL_la_LIBADD += ../../gbm/libgbm.la
-endif
-
-if HAVE_EGL_PLATFORM_NULL
-AM_CFLAGS += -DHAVE_NULL_PLATFORM
+libEGL_la_LIBADD += $(top_builddir)/src/gbm/libgbm.la
+dri2_backend_FILES += drivers/dri2/platform_drm.c
 endif
 
 if HAVE_EGL_PLATFORM_SURFACELESS
 AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
+dri2_backend_FILES += drivers/dri2/platform_surfaceless.c
 endif
 
 if HAVE_EGL_DRIVER_DRI2
-AM_CFLAGS += -D_EGL_BUILT_IN_DRIVER_DRI2
-AM_CFLAGS += -DHAVE_XCB_DRI2
-libEGL_la_LIBADD += ../drivers/dri2/libegl_dri2.la
+AM_CFLAGS += \
+	-I$(top_srcdir)/src/loader \
+	-I$(top_srcdir)/src/egl/drivers/dri2 \
+	-I$(top_srcdir)/src/gbm/backends/dri \
+	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
+	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
+	-I$(top_builddir)/src/egl/wayland/wayland-drm \
+	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \
+	-D_EGL_BUILT_IN_DRIVER_DRI2
+
+libEGL_la_SOURCES += \
+	$(dri2_backend_core_FILES) \
+	$(dri2_backend_FILES)
+
+libEGL_la_LIBADD += $(top_builddir)/src/loader/libloader.la
 libEGL_la_LIBADD += $(DLOPEN_LIBS) $(LIBDRM_LIBS)
 endif
 
@@ -83,7 +98,7 @@ include $(top_srcdir)/install-lib-links.mk
 
 pkgconfigdir = $(libdir)/pkgconfig
 
-pkgconfig_DATA = egl.pc
+pkgconfig_DATA = main/egl.pc
 
 khrdir = $(includedir)/KHR
 khr_HEADERS = $(top_srcdir)/include/KHR/khrplatform.h
@@ -97,6 +112,8 @@ egl_HEADERS = \
 	$(top_srcdir)/include/EGL/eglplatform.h
 
 EXTRA_DIST = \
-	egl.def \
-	README.txt \
-	SConscript
+	SConscript \
+	drivers/haiku \
+	docs \
+	main/egl.def \
+	main/README.txt
diff --git a/src/egl/Makefile.sources b/src/egl/Makefile.sources
new file mode 100644
index 00000000000..48db8518f8a
--- /dev/null
+++ b/src/egl/Makefile.sources
@@ -0,0 +1,34 @@
+LIBEGL_C_FILES := \
+	main/eglapi.c \
+	main/eglapi.h \
+	main/eglarray.c \
+	main/eglarray.h \
+	main/eglcompiler.h \
+	main/eglconfig.c \
+	main/eglconfig.h \
+	main/eglcontext.c \
+	main/eglcontext.h \
+	main/eglcurrent.c \
+	main/eglcurrent.h \
+	main/egldefines.h \
+	main/egldisplay.c \
+	main/egldisplay.h \
+	main/egldriver.c \
+	main/egldriver.h \
+	main/eglfallbacks.c \
+	main/eglglobals.c \
+	main/eglglobals.h \
+	main/eglimage.c \
+	main/eglimage.h \
+	main/egllog.c \
+	main/egllog.h \
+	main/eglsurface.c \
+	main/eglsurface.h \
+	main/eglsync.c \
+	main/eglsync.h \
+	main/egltypedefs.h
+
+dri2_backend_core_FILES := \
+	drivers/dri2/egl_dri2.c \
+	drivers/dri2/egl_dri2.h \
+	drivers/dri2/egl_dri2_fallbacks.h
diff --git a/src/egl/SConscript b/src/egl/SConscript
new file mode 100644
index 00000000000..1b2a4271ef7
--- /dev/null
+++ b/src/egl/SConscript
@@ -0,0 +1,34 @@
+#######################################################################
+# SConscript for EGL
+
+
+Import('*')
+
+env = env.Clone()
+
+env.Append(CPPPATH = [
+    '#/include',
+    '#/src/egl/main',
+    '#/src',
+])
+
+
+# parse Makefile.sources
+egl_sources = env.ParseSourceList('Makefile.sources', 'LIBEGL_C_FILES')
+egl_sources.append(env.ParseSourceList('Makefile.sources', 'dri2_backend_core_FILES'))
+
+env.Append(CPPDEFINES = [
+    '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
+    '_EGL_BUILT_IN_DRIVER_HAIKU',
+    'HAVE_HAIKU_PLATFORM',
+])
+egl_sources.append('drivers/haiku/egl_haiku.cpp')
+
+egl = env.SharedLibrary(
+    target = 'EGL',
+    source = egl_sources,
+)
+
+egl = env.InstallSharedLibrary(egl, version=(1, 0, 0))
+
+env.Alias('egl', egl)
diff --git a/src/egl/drivers/dri2/Android.mk b/src/egl/drivers/dri2/Android.mk
deleted file mode 100644
index 109e4d4a0d8..00000000000
--- a/src/egl/drivers/dri2/Android.mk
+++ /dev/null
@@ -1,64 +0,0 @@
-# Mesa 3-D graphics library
-#
-# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com>
-# Copyright (C) 2010-2011 LunarG Inc.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-# Android.mk for egl_dri2
-
-LOCAL_PATH := $(call my-dir)
-
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := \
-	egl_dri2.c \
-	platform_android.c
-
-LOCAL_CFLAGS := \
-	-DHAVE_SHARED_GLAPI \
-	-DHAVE_ANDROID_PLATFORM
-
-ifeq ($(MESA_LOLLIPOP_BUILD),true)
-LOCAL_CFLAGS_arm := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
-LOCAL_CFLAGS_x86 := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
-LOCAL_CFLAGS_x86_64 := -DDEFAULT_DRIVER_DIR=\"/system/lib64/dri\"
-else
-LOCAL_CFLAGS += -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
-endif
-
-LOCAL_C_INCLUDES := \
-	$(MESA_TOP)/src/mapi \
-	$(MESA_TOP)/src/egl/main \
-	$(DRM_GRALLOC_TOP)
-
-LOCAL_STATIC_LIBRARIES := \
-	libmesa_loader
-
-LOCAL_SHARED_LIBRARIES := libdrm
-
-ifeq ($(shell echo "$(MESA_ANDROID_VERSION) >= 4.2" | bc),1)
-LOCAL_SHARED_LIBRARIES += \
-	libsync
-endif
-
-LOCAL_MODULE := libmesa_egl_dri2
-
-include $(MESA_COMMON_MK)
-include $(BUILD_STATIC_LIBRARY)
diff --git a/src/egl/drivers/dri2/Makefile.am b/src/egl/drivers/dri2/Makefile.am
deleted file mode 100644
index 55be4a75ba5..00000000000
--- a/src/egl/drivers/dri2/Makefile.am
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright © 2012 Intel Corporation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-AM_CFLAGS = \
-	-I$(top_srcdir)/include \
-	-I$(top_srcdir)/src/egl/main \
-	-I$(top_srcdir)/src/loader \
-	-I$(top_srcdir)/src/gbm/main \
-	-I$(top_srcdir)/src/gbm/backends/dri \
-	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
-	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
-	-I$(top_builddir)/src/egl/wayland/wayland-drm \
-	$(DEFINES) \
-	$(VISIBILITY_CFLAGS) \
-	$(LIBDRM_CFLAGS) \
-	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\"
-
-noinst_LTLIBRARIES = libegl_dri2.la
-
-libegl_dri2_la_SOURCES = \
-	egl_dri2.c \
-	egl_dri2.h \
-	egl_dri2_fallbacks.h
-
-libegl_dri2_la_LIBADD = \
-	$(top_builddir)/src/loader/libloader.la \
-	$(EGL_LIB_DEPS)
-
-if HAVE_SHARED_GLAPI
-AM_CFLAGS += -DHAVE_SHARED_GLAPI
-endif
-
-if HAVE_EGL_PLATFORM_X11
-libegl_dri2_la_SOURCES += platform_x11.c
-AM_CFLAGS += -DHAVE_X11_PLATFORM
-AM_CFLAGS += $(XCB_DRI2_CFLAGS)
-endif
-
-if HAVE_EGL_PLATFORM_WAYLAND
-libegl_dri2_la_SOURCES += platform_wayland.c
-AM_CFLAGS += -DHAVE_WAYLAND_PLATFORM
-AM_CFLAGS += $(WAYLAND_CFLAGS)
-endif
-
-if HAVE_EGL_PLATFORM_DRM
-libegl_dri2_la_SOURCES += platform_drm.c
-AM_CFLAGS += -DHAVE_DRM_PLATFORM
-endif
-
-if HAVE_EGL_PLATFORM_SURFACELESS
-libegl_dri2_la_SOURCES += platform_surfaceless.c
-AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
-endif
-
-EXTRA_DIST = SConscript
diff --git a/src/egl/drivers/dri2/SConscript b/src/egl/drivers/dri2/SConscript
deleted file mode 100644
index 5b03107cbb3..00000000000
--- a/src/egl/drivers/dri2/SConscript
+++ /dev/null
@@ -1,40 +0,0 @@
-Import('*')
-
-env = env.Clone()
-
-env.Append(CPPDEFINES = [
-	'DEFAULT_DRIVER_DIR=\\"\\"'
-])
-
-env.Append(CPPPATH = [
-	'#/include',
-	'#/src/egl/main',
-	'#/src/loader',
-])
-
-sources = [
-	'egl_dri2.c',
-]
-
-if env['x11']:
-	sources.append('platform_x11.c')
-	env.Append(CPPDEFINES = [
-		'HAVE_X11_PLATFORM',
-	])
-	#env.Append(CPPPATH = [
-	#	'XCB_DRI2_CFLAGS',
-	#])
-
-if env['drm']:
-	env.PkgUseModules('DRM')
-
-env.Prepend(LIBS = [
-	libloader,
-])
-
-egl_dri2 = env.ConvenienceLibrary(
-	target = 'egl_dri2',
-	source = sources,
-)
-
-Export('egl_dri2')
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index a1cbd437f53..461735fe9e3 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -28,6 +28,7 @@
 #define WL_HIDE_DEPRECATED
 
 #include <stdint.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
@@ -51,7 +52,23 @@
 #endif
 
 #include "egl_dri2.h"
-#include "../util/u_atomic.h"
+#include "util/u_atomic.h"
+
+/* The kernel header drm_fourcc.h defines the DRM formats below.  We duplicate
+ * some of the definitions here so that building Mesa won't require
+ * bleeding-edge kernel headers.
+ */
+#ifndef DRM_FORMAT_R8
+#define DRM_FORMAT_R8            fourcc_code('R', '8', ' ', ' ') /* [7:0] R */
+#endif
+
+#ifndef DRM_FORMAT_RG88
+#define DRM_FORMAT_RG88          fourcc_code('R', 'G', '8', '8') /* [15:0] R:G 8:8 little endian */
+#endif
+
+#ifndef DRM_FORMAT_GR88
+#define DRM_FORMAT_GR88          fourcc_code('G', 'R', '8', '8') /* [15:0] G:R 8:8 little endian */
+#endif
 
 const __DRIuseInvalidateExtension use_invalidate = {
    .base = { __DRI_USE_INVALIDATE, 1 }
@@ -109,6 +126,18 @@ EGLint dri2_to_egl_attribute_map[] = {
    0,				/* __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE */
 };
 
+const __DRIconfig *
+dri2_get_dri_config(struct dri2_egl_config *conf, EGLint surface_type,
+                    EGLenum colorspace)
+{
+   if (colorspace == EGL_GL_COLORSPACE_SRGB_KHR)
+      return surface_type == EGL_WINDOW_BIT ? conf->dri_srgb_double_config :
+                                              conf->dri_srgb_single_config;
+   else
+      return surface_type == EGL_WINDOW_BIT ? conf->dri_double_config :
+                                              conf->dri_single_config;
+}
+
 static EGLBoolean
 dri2_match_config(const _EGLConfig *conf, const _EGLConfig *criteria)
 {
@@ -130,6 +159,7 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
    struct dri2_egl_display *dri2_dpy;
    _EGLConfig base;
    unsigned int attrib, value, double_buffer;
+   bool srgb = false;
    EGLint key, bind_to_texture_rgb, bind_to_texture_rgba;
    unsigned int dri_masks[4] = { 0, 0, 0, 0 };
    _EGLConfig *matching_config;
@@ -139,7 +169,7 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
 
    dri2_dpy = disp->DriverData;
    _eglInitConfig(&base, disp, id);
-   
+
    i = 0;
    double_buffer = 0;
    bind_to_texture_rgb = 0;
@@ -155,7 +185,7 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
 	 else
 	    return NULL;
 	 _eglSetConfigKey(&base, EGL_COLOR_BUFFER_TYPE, value);
-	 break;	 
+	 break;
 
       case __DRI_ATTRIB_CONFIG_CAVEAT:
          if (value & __DRI_ATTRIB_NON_CONFORMANT_CONFIG)
@@ -204,6 +234,10 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
             return NULL;
          break;
 
+      case __DRI_ATTRIB_FRAMEBUFFER_SRGB_CAPABLE:
+         srgb = value != 0;
+         break;
+
       default:
 	 key = dri2_to_egl_attribute_map[attrib];
 	 if (key != 0)
@@ -249,28 +283,35 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
    if (num_configs == 1) {
       conf = (struct dri2_egl_config *) matching_config;
 
-      if (double_buffer && !conf->dri_double_config)
+      if (double_buffer && srgb && !conf->dri_srgb_double_config)
+         conf->dri_srgb_double_config = dri_config;
+      else if (double_buffer && !srgb && !conf->dri_double_config)
          conf->dri_double_config = dri_config;
-      else if (!double_buffer && !conf->dri_single_config)
+      else if (!double_buffer && srgb && !conf->dri_srgb_single_config)
+         conf->dri_srgb_single_config = dri_config;
+      else if (!double_buffer && !srgb && !conf->dri_single_config)
          conf->dri_single_config = dri_config;
       else
          /* a similar config type is already added (unlikely) => discard */
          return NULL;
    }
    else if (num_configs == 0) {
-      conf = malloc(sizeof *conf);
+      conf = calloc(1, sizeof *conf);
       if (conf == NULL)
          return NULL;
 
       memcpy(&conf->base, &base, sizeof base);
       if (double_buffer) {
-         conf->dri_double_config = dri_config;
-         conf->dri_single_config = NULL;
+         if (srgb)
+            conf->dri_srgb_double_config = dri_config;
+         else
+            conf->dri_double_config = dri_config;
       } else {
-         conf->dri_single_config = dri_config;
-         conf->dri_double_config = NULL;
+         if (srgb)
+            conf->dri_srgb_single_config = dri_config;
+         else
+            conf->dri_single_config = dri_config;
       }
-      conf->base.SurfaceType = 0;
       conf->base.ConfigID = config_id;
 
       _eglLinkConfig(&conf->base);
@@ -365,7 +406,7 @@ dri2_bind_extensions(struct dri2_egl_display *dri2_dpy,
 	 }
       }
    }
-   
+
    for (j = 0; matches[j].name; j++) {
       field = ((char *) dri2_dpy + matches[j].offset);
       if (*(const __DRIextension **) field == NULL) {
@@ -500,6 +541,19 @@ dri2_load_driver_swrast(_EGLDisplay *disp)
    return EGL_TRUE;
 }
 
+static unsigned
+dri2_renderer_query_integer(struct dri2_egl_display *dri2_dpy, int param)
+{
+   const __DRI2rendererQueryExtension *rendererQuery = dri2_dpy->rendererQuery;
+   unsigned int value = 0;
+
+   if (!rendererQuery ||
+       rendererQuery->queryInteger(dri2_dpy->dri_screen, param, &value) == -1)
+      return 0;
+
+   return value;
+}
+
 void
 dri2_setup_screen(_EGLDisplay *disp)
 {
@@ -530,6 +584,10 @@ dri2_setup_screen(_EGLDisplay *disp)
    disp->Extensions.KHR_surfaceless_context = EGL_TRUE;
    disp->Extensions.MESA_configless_context = EGL_TRUE;
 
+   if (dri2_renderer_query_integer(dri2_dpy,
+                                   __DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB))
+      disp->Extensions.KHR_gl_colorspace = EGL_TRUE;
+
    if (dri2_dpy->dri2 && dri2_dpy->dri2->base.version >= 3) {
       disp->Extensions.KHR_create_context = EGL_TRUE;
 
@@ -567,6 +625,9 @@ dri2_setup_screen(_EGLDisplay *disp)
          disp->Extensions.KHR_gl_texture_2D_image = EGL_TRUE;
          disp->Extensions.KHR_gl_texture_cubemap_image = EGL_TRUE;
       }
+      if (dri2_renderer_query_integer(dri2_dpy,
+                                      __DRI2_RENDERER_HAS_TEXTURE_3D))
+         disp->Extensions.KHR_gl_texture_3D_image = EGL_TRUE;
 #ifdef HAVE_LIBDRM
       if (dri2_dpy->image->base.version >= 8 &&
           dri2_dpy->image->createImageFromDmaBufs) {
@@ -624,7 +685,7 @@ dri2_create_screen(_EGLDisplay *disp)
    dri2_dpy->own_dri_screen = 1;
 
    extensions = dri2_dpy->core->getExtensions(dri2_dpy->dri_screen);
-   
+
    if (dri2_dpy->dri2) {
       if (!dri2_bind_extensions(dri2_dpy, dri2_core_extensions, extensions))
          goto cleanup_dri_screen;
@@ -644,6 +705,9 @@ dri2_create_screen(_EGLDisplay *disp)
       if (strcmp(extensions[i]->name, __DRI2_FENCE) == 0) {
          dri2_dpy->fence = (__DRI2fenceExtension *) extensions[i];
       }
+      if (strcmp(extensions[i]->name, __DRI2_RENDERER_QUERY) == 0) {
+         dri2_dpy->rendererQuery = (__DRI2rendererQueryExtension *) extensions[i];
+      }
    }
 
    dri2_setup_screen(disp);
@@ -1384,53 +1448,6 @@ dri2_create_image_khr_renderbuffer(_EGLDisplay *disp, _EGLContext *ctx,
    return dri2_create_image_from_dri(disp, dri_image);
 }
 
-#ifdef HAVE_LIBDRM
-static _EGLImage *
-dri2_create_image_mesa_drm_buffer(_EGLDisplay *disp, _EGLContext *ctx,
-				  EGLClientBuffer buffer, const EGLint *attr_list)
-{
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   EGLint format, name, pitch, err;
-   _EGLImageAttribs attrs;
-   __DRIimage *dri_image;
-
-   name = (EGLint) (uintptr_t) buffer;
-
-   err = _eglParseImageAttribList(&attrs, disp, attr_list);
-   if (err != EGL_SUCCESS)
-      return NULL;
-
-   if (attrs.Width <= 0 || attrs.Height <= 0 ||
-       attrs.DRMBufferStrideMESA <= 0) {
-      _eglError(EGL_BAD_PARAMETER,
-		"bad width, height or stride");
-      return NULL;
-   }
-
-   switch (attrs.DRMBufferFormatMESA) {
-   case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
-      format = __DRI_IMAGE_FORMAT_ARGB8888;
-      pitch = attrs.DRMBufferStrideMESA;
-      break;
-   default:
-      _eglError(EGL_BAD_PARAMETER,
-		"dri2_create_image_khr: unsupported pixmap depth");
-      return NULL;
-   }
-
-   dri_image =
-      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
-					   attrs.Width,
-					   attrs.Height,
-					   format,
-					   name,
-					   pitch,
-					   NULL);
-
-   return dri2_create_image_from_dri(disp, dri_image);
-}
-#endif
-
 #ifdef HAVE_WAYLAND_PLATFORM
 
 /* This structure describes how a wl_buffer maps to one or more
@@ -1528,6 +1545,10 @@ dri2_create_image_khr_texture_error(int dri_error)
       egl_error = EGL_BAD_PARAMETER;
       break;
 
+   case __DRI_IMAGE_ERROR_BAD_ACCESS:
+      egl_error = EGL_BAD_ACCESS;
+      break;
+
    default:
       assert(0);
       egl_error = EGL_BAD_MATCH;
@@ -1566,9 +1587,15 @@ dri2_create_image_khr_texture(_EGLDisplay *disp, _EGLContext *ctx,
       gl_target = GL_TEXTURE_2D;
       break;
    case EGL_GL_TEXTURE_3D_KHR:
-      depth = attrs.GLTextureZOffset;
-      gl_target = GL_TEXTURE_3D;
-      break;
+      if (disp->Extensions.KHR_gl_texture_3D_image) {
+         depth = attrs.GLTextureZOffset;
+         gl_target = GL_TEXTURE_3D;
+         break;
+      }
+      else {
+         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+         return EGL_NO_IMAGE_KHR;
+      }
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR:
    case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR:
@@ -1621,6 +1648,51 @@ dri2_create_wayland_buffer_from_image(_EGLDriver *drv, _EGLDisplay *dpy,
 }
 
 #ifdef HAVE_LIBDRM
+static _EGLImage *
+dri2_create_image_mesa_drm_buffer(_EGLDisplay *disp, _EGLContext *ctx,
+				  EGLClientBuffer buffer, const EGLint *attr_list)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   EGLint format, name, pitch, err;
+   _EGLImageAttribs attrs;
+   __DRIimage *dri_image;
+
+   name = (EGLint) (uintptr_t) buffer;
+
+   err = _eglParseImageAttribList(&attrs, disp, attr_list);
+   if (err != EGL_SUCCESS)
+      return NULL;
+
+   if (attrs.Width <= 0 || attrs.Height <= 0 ||
+       attrs.DRMBufferStrideMESA <= 0) {
+      _eglError(EGL_BAD_PARAMETER,
+		"bad width, height or stride");
+      return NULL;
+   }
+
+   switch (attrs.DRMBufferFormatMESA) {
+   case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
+      format = __DRI_IMAGE_FORMAT_ARGB8888;
+      pitch = attrs.DRMBufferStrideMESA;
+      break;
+   default:
+      _eglError(EGL_BAD_PARAMETER,
+		"dri2_create_image_khr: unsupported pixmap depth");
+      return NULL;
+   }
+
+   dri_image =
+      dri2_dpy->image->createImageFromName(dri2_dpy->dri_screen,
+					   attrs.Width,
+					   attrs.Height,
+					   format,
+					   name,
+					   pitch,
+					   NULL);
+
+   return dri2_create_image_from_dri(disp, dri_image);
+}
+
 static EGLBoolean
 dri2_check_dma_buf_attribs(const _EGLImageAttribs *attrs)
 {
@@ -1673,6 +1745,9 @@ dri2_check_dma_buf_format(const _EGLImageAttribs *attrs)
    unsigned i, plane_n;
 
    switch (attrs->DMABufFourCC.Value) {
+   case DRM_FORMAT_R8:
+   case DRM_FORMAT_RG88:
+   case DRM_FORMAT_GR88:
    case DRM_FORMAT_RGB332:
    case DRM_FORMAT_BGR233:
    case DRM_FORMAT_XRGB4444:
@@ -1850,59 +1925,6 @@ dri2_create_image_dma_buf(_EGLDisplay *disp, _EGLContext *ctx,
 
    return res;
 }
-#endif
-
-_EGLImage *
-dri2_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
-		      _EGLContext *ctx, EGLenum target,
-		      EGLClientBuffer buffer, const EGLint *attr_list)
-{
-   (void) drv;
-
-   switch (target) {
-   case EGL_GL_TEXTURE_2D_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR:
-   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR:
-      return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
-   case EGL_GL_RENDERBUFFER_KHR:
-      return dri2_create_image_khr_renderbuffer(disp, ctx, buffer, attr_list);
-#ifdef HAVE_LIBDRM
-   case EGL_DRM_BUFFER_MESA:
-      return dri2_create_image_mesa_drm_buffer(disp, ctx, buffer, attr_list);
-#endif
-#ifdef HAVE_WAYLAND_PLATFORM
-   case EGL_WAYLAND_BUFFER_WL:
-      return dri2_create_image_wayland_wl_buffer(disp, ctx, buffer, attr_list);
-#endif
-#ifdef HAVE_LIBDRM
-   case EGL_LINUX_DMA_BUF_EXT:
-      return dri2_create_image_dma_buf(disp, ctx, buffer, attr_list);
-#endif
-   default:
-      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
-      return EGL_NO_IMAGE_KHR;
-   }
-}
-
-static EGLBoolean
-dri2_destroy_image_khr(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *image)
-{
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   struct dri2_egl_image *dri2_img = dri2_egl_image(image);
-
-   (void) drv;
-
-   dri2_dpy->image->destroyImage(dri2_img->dri_image);
-   free(dri2_img);
-
-   return EGL_TRUE;
-}
-
-#ifdef HAVE_LIBDRM
 static _EGLImage *
 dri2_create_drm_image_mesa(_EGLDriver *drv, _EGLDisplay *disp,
 			   const EGLint *attr_list)
@@ -1970,7 +1992,7 @@ dri2_create_drm_image_mesa(_EGLDriver *drv, _EGLDisplay *disp,
    if (attrs.DRMBufferUseMESA & EGL_DRM_BUFFER_USE_CURSOR_MESA)
       dri_use |= __DRI_IMAGE_USE_CURSOR;
 
-   dri2_img->dri_image = 
+   dri2_img->dri_image =
       dri2_dpy->image->createImage(dri2_dpy->dri_screen,
 				   attrs.Width, attrs.Height,
                                    format, dri_use, dri2_img);
@@ -2062,8 +2084,65 @@ dri2_export_dma_buf_image_mesa(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *im
 
    return EGL_TRUE;
 }
+
 #endif
 
+_EGLImage *
+dri2_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
+		      _EGLContext *ctx, EGLenum target,
+		      EGLClientBuffer buffer, const EGLint *attr_list)
+{
+   (void) drv;
+
+   switch (target) {
+   case EGL_GL_TEXTURE_2D_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z_KHR:
+   case EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_KHR:
+      return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
+   case EGL_GL_TEXTURE_3D_KHR:
+      if (disp->Extensions.KHR_gl_texture_3D_image) {
+         return dri2_create_image_khr_texture(disp, ctx, target, buffer, attr_list);
+      }
+      else {
+         _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+         return EGL_NO_IMAGE_KHR;
+      }
+   case EGL_GL_RENDERBUFFER_KHR:
+      return dri2_create_image_khr_renderbuffer(disp, ctx, buffer, attr_list);
+#ifdef HAVE_LIBDRM
+   case EGL_DRM_BUFFER_MESA:
+      return dri2_create_image_mesa_drm_buffer(disp, ctx, buffer, attr_list);
+   case EGL_LINUX_DMA_BUF_EXT:
+      return dri2_create_image_dma_buf(disp, ctx, buffer, attr_list);
+#endif
+#ifdef HAVE_WAYLAND_PLATFORM
+   case EGL_WAYLAND_BUFFER_WL:
+      return dri2_create_image_wayland_wl_buffer(disp, ctx, buffer, attr_list);
+#endif
+   default:
+      _eglError(EGL_BAD_PARAMETER, "dri2_create_image_khr");
+      return EGL_NO_IMAGE_KHR;
+   }
+}
+
+static EGLBoolean
+dri2_destroy_image_khr(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *image)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   struct dri2_egl_image *dri2_img = dri2_egl_image(image);
+
+   (void) drv;
+
+   dri2_dpy->image->destroyImage(dri2_img->dri_image);
+   free(dri2_img);
+
+   return EGL_TRUE;
+}
+
 #ifdef HAVE_WAYLAND_PLATFORM
 
 static void
@@ -2141,13 +2220,11 @@ dri2_bind_wayland_display_wl(_EGLDriver *drv, _EGLDisplay *disp,
    wl_drm_callbacks.authenticate =
       (int(*)(void *, uint32_t)) dri2_dpy->vtbl->authenticate;
 
-#ifdef HAVE_LIBDRM
    if (drmGetCap(dri2_dpy->fd, DRM_CAP_PRIME, &cap) == 0 &&
        cap == (DRM_PRIME_CAP_IMPORT | DRM_PRIME_CAP_EXPORT) &&
        dri2_dpy->image->base.version >= 7 &&
        dri2_dpy->image->createImageFromFds != NULL)
       flags |= WAYLAND_DRM_PRIME;
-#endif
 
    dri2_dpy->wl_server_drm =
 	   wayland_drm_init(wl_dpy, dri2_dpy->device_name,
@@ -2351,18 +2428,12 @@ static EGLBoolean
 dri2_load(_EGLDriver *drv)
 {
    struct dri2_egl_driver *dri2_drv = dri2_egl_driver(drv);
-#ifdef HAVE_SHARED_GLAPI
 #ifdef HAVE_ANDROID_PLATFORM
    const char *libname = "libglapi.so";
+#elif defined(__APPLE__)
+   const char *libname = "libglapi.0.dylib";
 #else
    const char *libname = "libglapi.so.0";
-#endif
-#else
-   /*
-    * Both libGL.so and libglapi.so are glapi providers.  There is no way to
-    * tell which one to load.
-    */
-   const char *libname = NULL;
 #endif
    void *handle;
 
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index 9985c49f984..9aa2a8c1003 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -120,9 +120,9 @@ struct dri2_egl_display_vtbl {
    EGLBoolean (*swap_buffers)(_EGLDriver *drv, _EGLDisplay *dpy,
                               _EGLSurface *surf);
 
-   EGLBoolean (*swap_buffers_with_damage)(_EGLDriver *drv, _EGLDisplay *dpy,     
-                                          _EGLSurface *surface,                  
-                                          const EGLint *rects, EGLint n_rects);  
+   EGLBoolean (*swap_buffers_with_damage)(_EGLDriver *drv, _EGLDisplay *dpy,
+                                          _EGLSurface *surface,
+                                          const EGLint *rects, EGLint n_rects);
 
    EGLBoolean (*swap_buffers_region)(_EGLDriver *drv, _EGLDisplay *dpy,
                                      _EGLSurface *surf, EGLint numRects,
@@ -166,6 +166,7 @@ struct dri2_egl_display
    const __DRIrobustnessExtension *robustness;
    const __DRI2configQueryExtension *config;
    const __DRI2fenceExtension *fence;
+   const __DRI2rendererQueryExtension *rendererQuery;
    int                       fd;
 
    int                       own_device;
@@ -285,6 +286,8 @@ struct dri2_egl_config
    _EGLConfig         base;
    const __DRIconfig *dri_single_config;
    const __DRIconfig *dri_double_config;
+   const __DRIconfig *dri_srgb_single_config;
+   const __DRIconfig *dri_srgb_double_config;
 };
 
 struct dri2_egl_image
@@ -357,4 +360,8 @@ dri2_initialize_surfaceless(_EGLDriver *drv, _EGLDisplay *disp);
 void
 dri2_flush_drawable_for_swapbuffers(_EGLDisplay *disp, _EGLSurface *draw);
 
+const __DRIconfig *
+dri2_get_dri_config(struct dri2_egl_config *conf, EGLint surface_type,
+                    EGLenum colorspace);
+
 #endif /* EGL_DRI2_INCLUDED */
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index fed3073088a..4abe82f63a0 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -199,6 +199,7 @@ droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
    struct dri2_egl_surface *dri2_surf;
    struct ANativeWindow *window = native_window;
+   const __DRIconfig *config;
 
    dri2_surf = calloc(1, sizeof *dri2_surf);
    if (!dri2_surf) {
@@ -230,9 +231,11 @@ droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
       window->query(window, NATIVE_WINDOW_HEIGHT, &dri2_surf->base.Height);
    }
 
+   config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
+                                dri2_surf->base.GLColorspace);
+
    dri2_surf->dri_drawable =
-      (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen,
-					   dri2_conf->dri_double_config,
+      (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
                                            dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
       _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index a62da4121fe..a439a3be6b6 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -115,8 +115,11 @@ dri2_drm_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
 
    switch (type) {
    case EGL_WINDOW_BIT:
-      if (!window)
-         return NULL;
+      if (!window) {
+         _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         goto cleanup_surf;
+      }
+
       surf = gbm_dri_surface(window);
       dri2_surf->gbm_surf = surf;
       dri2_surf->base.Width =  surf->base.width;
@@ -128,10 +131,13 @@ dri2_drm_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    }
 
    if (dri2_dpy->dri2) {
+      const __DRIconfig *config =
+         dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
+                             dri2_surf->base.GLColorspace);
+
       dri2_surf->dri_drawable =
-         (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
-                                               dri2_conf->dri_double_config,
-                                               dri2_surf->gbm_surf);
+         (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
+                                              dri2_surf->gbm_surf);
 
    } else {
       assert(dri2_dpy->swrast != NULL);
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 1c985523862..dabaf1ebbd1 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -65,7 +65,7 @@ sync_callback(void *data, struct wl_callback *callback, uint32_t serial)
 }
 
 static const struct wl_callback_listener sync_listener = {
-   sync_callback
+   .done = sync_callback
 };
 
 static int
@@ -104,8 +104,8 @@ wl_buffer_release(void *data, struct wl_buffer *buffer)
    dri2_surf->color_buffers[i].locked = 0;
 }
 
-static struct wl_buffer_listener wl_buffer_listener = {
-   wl_buffer_release
+static const struct wl_buffer_listener wl_buffer_listener = {
+   .release = wl_buffer_release
 };
 
 static void
@@ -130,6 +130,7 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
    struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
    struct wl_egl_window *window = native_window;
    struct dri2_egl_surface *dri2_surf;
+   const __DRIconfig *config;
 
    (void) drv;
 
@@ -138,7 +139,7 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
       _eglError(EGL_BAD_ALLOC, "dri2_create_surface");
       return NULL;
    }
-   
+
    if (!_eglInitSurface(&dri2_surf->base, disp, EGL_WINDOW_BIT, conf, attrib_list))
       goto cleanup_surf;
 
@@ -149,6 +150,11 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
    else
       dri2_surf->format = WL_DRM_FORMAT_ARGB8888;
 
+   if (!window) {
+      _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+      goto cleanup_surf;
+   }
+
    dri2_surf->wl_win = window;
 
    dri2_surf->wl_win->private = dri2_surf;
@@ -157,19 +163,19 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
    dri2_surf->base.Width =  -1;
    dri2_surf->base.Height = -1;
 
+   config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
+                                dri2_surf->base.GLColorspace);
+
    dri2_surf->dri_drawable = 
-      (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
-					    dri2_conf->dri_double_config,
-					    dri2_surf);
+      (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
+                                           dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
       _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
-      goto cleanup_dri_drawable;
+      goto cleanup_surf;
    }
 
    return &dri2_surf->base;
 
- cleanup_dri_drawable:
-   dri2_dpy->core->destroyDrawable(dri2_surf->dri_drawable);
  cleanup_surf:
    free(dri2_surf);
 
@@ -361,7 +367,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
    }
 
    if (dri2_surf->back->dri_image == NULL) {
-      dri2_surf->back->dri_image = 
+      dri2_surf->back->dri_image =
          dri2_dpy->image->createImage(dri2_dpy->dri_screen,
                                       dri2_surf->base.Width,
                                       dri2_surf->base.Height,
@@ -595,7 +601,7 @@ wayland_throttle_callback(void *data,
 }
 
 static const struct wl_callback_listener throttle_listener = {
-   wayland_throttle_callback
+   .done = wayland_throttle_callback
 };
 
 static void
@@ -839,22 +845,6 @@ bad_format:
    return NULL;
 }
 
-static char
-is_fd_render_node(int fd)
-{
-   struct stat render;
-
-   if (fstat(fd, &render))
-      return 0;
-
-   if (!S_ISCHR(render.st_mode))
-      return 0;
-
-   if (render.st_rdev & 0x80)
-      return 1;
-   return 0;
-}
-
 static int
 dri2_wl_authenticate(_EGLDisplay *disp, uint32_t id)
 {
@@ -898,7 +888,7 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device)
       return;
    }
 
-   if (is_fd_render_node(dri2_dpy->fd)) {
+   if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) {
       dri2_dpy->authenticated = 1;
    } else {
       drmGetMagic(dri2_dpy->fd, &magic);
@@ -941,10 +931,10 @@ drm_handle_authenticated(void *data, struct wl_drm *drm)
 }
 
 static const struct wl_drm_listener drm_listener = {
-	drm_handle_device,
-	drm_handle_format,
-	drm_handle_authenticated,
-	drm_handle_capabilities
+   .device = drm_handle_device,
+   .format = drm_handle_format,
+   .authenticated = drm_handle_authenticated,
+   .capabilities = drm_handle_capabilities
 };
 
 static void
@@ -969,8 +959,8 @@ registry_handle_global_remove(void *data, struct wl_registry *registry,
 }
 
 static const struct wl_registry_listener registry_listener_drm = {
-   registry_handle_global_drm,
-   registry_handle_global_remove
+   .global = registry_handle_global_drm,
+   .global_remove = registry_handle_global_remove
 };
 
 static EGLBoolean
@@ -1108,7 +1098,7 @@ dri2_initialize_wayland_drm(_EGLDriver *drv, _EGLDisplay *disp)
     * will return a render-node when the requested gpu is different
     * to the server, but also if the client asks for the same gpu than
     * the server by requesting its pci-id */
-   dri2_dpy->is_render_node = is_fd_render_node(dri2_dpy->fd);
+   dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
 
    dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
    if (dri2_dpy->driver_name == NULL) {
@@ -1220,7 +1210,7 @@ dri2_initialize_wayland_drm(_EGLDriver *drv, _EGLDisplay *disp)
    wl_event_queue_destroy(dri2_dpy->wl_queue);
  cleanup_dpy:
    free(dri2_dpy);
-   
+
    return EGL_FALSE;
 }
 
@@ -1726,7 +1716,7 @@ shm_handle_format(void *data, struct wl_shm *shm, uint32_t format)
 }
 
 static const struct wl_shm_listener shm_listener = {
-   shm_handle_format
+   .format = shm_handle_format
 };
 
 static void
@@ -1743,8 +1733,8 @@ registry_handle_global_swrast(void *data, struct wl_registry *registry, uint32_t
 }
 
 static const struct wl_registry_listener registry_listener_swrast = {
-   registry_handle_global_swrast,
-   registry_handle_global_remove
+   .global = registry_handle_global_swrast,
+   .global_remove = registry_handle_global_remove
 };
 
 static struct dri2_egl_display_vtbl dri2_wl_swrast_display_vtbl = {
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 56c14288204..bf7d2bea4c1 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -56,7 +56,7 @@ swrastCreateDrawable(struct dri2_egl_display * dri2_dpy,
    uint32_t           mask;
    const uint32_t     function = GXcopy;
    uint32_t           valgc[2];
-   
+
    /* create GC's */
    dri2_surf->gc = xcb_generate_id(dri2_dpy->conn);
    mask = XCB_GC_FUNCTION;
@@ -226,7 +226,7 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
       s = xcb_setup_roots_iterator(xcb_get_setup(dri2_dpy->conn));
       screen = get_xcb_screen(s, dri2_dpy->screen);
       if (!screen) {
-         _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         _eglError(EGL_BAD_ALLOC, "failed to get xcb screen");
          goto cleanup_surf;
       }
 
@@ -235,16 +235,23 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
                        dri2_surf->drawable, screen->root,
 			dri2_surf->base.Width, dri2_surf->base.Height);
    } else {
+      if (!drawable) {
+         if (type == EGL_WINDOW_BIT)
+            _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_create_surface");
+         else
+            _eglError(EGL_BAD_NATIVE_PIXMAP, "dri2_create_surface");
+         goto cleanup_surf;
+      }
       dri2_surf->drawable = drawable;
    }
 
    if (dri2_dpy->dri2) {
-      dri2_surf->dri_drawable = 
-	 (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
-					       type == EGL_WINDOW_BIT ?
-					       dri2_conf->dri_double_config : 
-					       dri2_conf->dri_single_config,
-					       dri2_surf);
+      const __DRIconfig *config =
+         dri2_get_dri_config(dri2_conf, type, dri2_surf->base.GLColorspace);
+
+      dri2_surf->dri_drawable =
+	 (*dri2_dpy->dri2->createNewDrawable)(dri2_dpy->dri_screen, config,
+					      dri2_surf);
    } else {
       assert(dri2_dpy->swrast);
       dri2_surf->dri_drawable = 
@@ -261,10 +268,18 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    if (type != EGL_PBUFFER_BIT) {
       cookie = xcb_get_geometry (dri2_dpy->conn, dri2_surf->drawable);
       reply = xcb_get_geometry_reply (dri2_dpy->conn, cookie, &error);
-      if (reply == NULL || error != NULL) {
-	 _eglError(EGL_BAD_ALLOC, "xcb_get_geometry");
-	 free(error);
-	 goto cleanup_dri_drawable;
+      if (error != NULL) {
+         if (error->error_code == BadAlloc)
+            _eglError(EGL_BAD_ALLOC, "xcb_get_geometry");
+         else if (type == EGL_WINDOW_BIT)
+            _eglError(EGL_BAD_NATIVE_WINDOW, "xcb_get_geometry");
+         else
+            _eglError(EGL_BAD_NATIVE_PIXMAP, "xcb_get_geometry");
+         free(error);
+         goto cleanup_dri_drawable;
+      } else if (reply == NULL) {
+         _eglError(EGL_BAD_ALLOC, "xcb_get_geometry");
+         goto cleanup_dri_drawable;
       }
 
       dri2_surf->base.Width = reply->width;
@@ -274,7 +289,25 @@ dri2_x11_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    }
 
    if (dri2_dpy->dri2) {
-      xcb_dri2_create_drawable (dri2_dpy->conn, dri2_surf->drawable);
+      xcb_void_cookie_t cookie;
+      int conn_error;
+
+      cookie = xcb_dri2_create_drawable_checked(dri2_dpy->conn,
+                                                dri2_surf->drawable);
+      error = xcb_request_check(dri2_dpy->conn, cookie);
+      conn_error = xcb_connection_has_error(dri2_dpy->conn);
+      if (conn_error || error != NULL) {
+         if (type == EGL_PBUFFER_BIT || conn_error || error->error_code == BadAlloc)
+            _eglError(EGL_BAD_ALLOC, "xcb_dri2_create_drawable_checked");
+         else if (type == EGL_WINDOW_BIT)
+            _eglError(EGL_BAD_NATIVE_WINDOW,
+                      "xcb_dri2_create_drawable_checked");
+         else
+            _eglError(EGL_BAD_NATIVE_PIXMAP,
+                      "xcb_dri2_create_drawable_checked");
+         free(error);
+         goto cleanup_dri_drawable;
+      }
    } else {
       if (type == EGL_PBUFFER_BIT) {
          dri2_surf->depth = _eglGetConfigKey(conf, EGL_BUFFER_SIZE);
@@ -515,7 +548,7 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
    xcb_generic_error_t *error;
    xcb_screen_iterator_t s;
    xcb_screen_t *screen;
-   char *driver_name, *device_name;
+   char *driver_name, *loader_driver_name, *device_name;
    const xcb_query_extension_reply_t *extension;
 
    xcb_prefetch_extension_data (dri2_dpy->conn, &xcb_xfixes_id);
@@ -540,7 +573,7 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
    s = xcb_setup_roots_iterator(xcb_get_setup(dri2_dpy->conn));
    screen = get_xcb_screen(s, dri2_dpy->screen);
    if (!screen) {
-      _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_x11_connect");
+      _eglLog(_EGL_WARNING, "DRI2: failed to get xcb screen");
       return EGL_FALSE;
    }
    connect_cookie = xcb_dri2_connect_unchecked(dri2_dpy->conn, screen->root,
@@ -575,18 +608,38 @@ dri2_x11_connect(struct dri2_egl_display *dri2_dpy)
       return EGL_FALSE;
    }
 
-   driver_name = xcb_dri2_connect_driver_name (connect);
-   dri2_dpy->driver_name =
-      strndup(driver_name,
-              xcb_dri2_connect_driver_name_length(connect));
-
    device_name = xcb_dri2_connect_device_name (connect);
 
    dri2_dpy->device_name =
       strndup(device_name,
               xcb_dri2_connect_device_name_length(connect));
 
+   dri2_dpy->fd = loader_open_device(dri2_dpy->device_name);
+   if (dri2_dpy->fd == -1) {
+      _eglLog(_EGL_WARNING,
+              "DRI2: could not open %s (%s)", dri2_dpy->device_name,
+              strerror(errno));
+      free(dri2_dpy->device_name);
+      free(connect);
+      return EGL_FALSE;
+   }
+
+   driver_name = xcb_dri2_connect_driver_name (connect);
+
+   /* If Mesa knows about the appropriate driver for this fd, then trust it.
+    * Otherwise, default to the server's value.
+    */
+   loader_driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
+   if (loader_driver_name) {
+      dri2_dpy->driver_name = loader_driver_name;
+   } else {
+      dri2_dpy->driver_name =
+         strndup(driver_name,
+                 xcb_dri2_connect_driver_name_length(connect));
+   }
+
    if (dri2_dpy->device_name == NULL || dri2_dpy->driver_name == NULL) {
+      close(dri2_dpy->fd);
       free(dri2_dpy->device_name);
       free(dri2_dpy->driver_name);
       free(connect);
@@ -611,7 +664,7 @@ dri2_x11_authenticate(_EGLDisplay *disp, uint32_t id)
 
    screen = get_xcb_screen(s, dri2_dpy->screen);
    if (!screen) {
-      _eglError(EGL_BAD_NATIVE_WINDOW, "dri2_x11_authenticate");
+      _eglLog(_EGL_WARNING, "DRI2: failed to get xcb screen");
       return -1;
    }
 
@@ -1099,7 +1152,7 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
       dri2_dpy->screen = DefaultScreen(dpy);
    }
 
-   if (xcb_connection_has_error(dri2_dpy->conn)) {
+   if (!dri2_dpy->conn || xcb_connection_has_error(dri2_dpy->conn)) {
       _eglLog(_EGL_WARNING, "DRI2: xcb_connect failed");
       goto cleanup_dpy;
    }
@@ -1125,10 +1178,8 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
    if (!dri2_create_screen(disp))
       goto cleanup_driver;
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
-         goto cleanup_configs;
-   }
+   if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
+      goto cleanup_configs;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
@@ -1218,31 +1269,19 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
       dri2_dpy->screen = DefaultScreen(dpy);
    }
 
-   if (xcb_connection_has_error(dri2_dpy->conn)) {
+   if (!dri2_dpy->conn || xcb_connection_has_error(dri2_dpy->conn)) {
       _eglLog(_EGL_WARNING, "DRI2: xcb_connect failed");
       goto cleanup_dpy;
    }
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_connect(dri2_dpy))
-	 goto cleanup_conn;
-   }
-
-   if (!dri2_load_driver(disp))
+   if (!dri2_x11_connect(dri2_dpy))
       goto cleanup_conn;
 
-   dri2_dpy->fd = loader_open_device(dri2_dpy->device_name);
-   if (dri2_dpy->fd == -1) {
-      _eglLog(_EGL_WARNING,
-	      "DRI2: could not open %s (%s)", dri2_dpy->device_name,
-              strerror(errno));
-      goto cleanup_driver;
-   }
+   if (!dri2_x11_local_authenticate(disp))
+      goto cleanup_fd;
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_local_authenticate(disp))
-	 goto cleanup_fd;
-   }
+   if (!dri2_load_driver(disp))
+      goto cleanup_fd;
 
    if (dri2_dpy->dri2_minor >= 1) {
       dri2_dpy->dri2_loader_extension.base.name = __DRI_DRI2_LOADER;
@@ -1267,7 +1306,7 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
    dri2_dpy->invalidate_available = (dri2_dpy->dri2_minor >= 3);
 
    if (!dri2_create_screen(disp))
-      goto cleanup_fd;
+      goto cleanup_driver;
 
    dri2_x11_setup_swap_interval(dri2_dpy);
 
@@ -1281,10 +1320,8 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
    disp->Extensions.WL_bind_wayland_display = EGL_TRUE;
 #endif
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
-	 goto cleanup_configs;
-   }
+   if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
+      goto cleanup_configs;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
@@ -1296,10 +1333,10 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
  cleanup_configs:
    _eglCleanupDisplay(disp);
    dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
- cleanup_fd:
-   close(dri2_dpy->fd);
  cleanup_driver:
    dlclose(dri2_dpy->driver);
+ cleanup_fd:
+   close(dri2_dpy->fd);
  cleanup_conn:
    if (disp->PlatformDisplay == NULL)
       xcb_disconnect(dri2_dpy->conn);
diff --git a/src/egl/drivers/haiku/SConscript b/src/egl/drivers/haiku/SConscript
deleted file mode 100644
index ec6020ece77..00000000000
--- a/src/egl/drivers/haiku/SConscript
+++ /dev/null
@@ -1,29 +0,0 @@
-Import('*')
-
-env = env.Clone()
-
-env.Append(CPPDEFINES = [
-	'DEFAULT_DRIVER_DIR=\\"\\"',
-])
-
-env.Append(CPPPATH = [
-	'#/include',
-	'#/src/egl/main',
-])
-
-sources = [
-	'egl_haiku.cpp'
-]
-
-if env['platform'] == 'haiku':
-	env.Append(CPPDEFINES = [
-		'HAVE_HAIKU_PLATFORM',
-		'_EGL_NATIVE_PLATFORM=haiku',
-	])
-
-egl_haiku = env.ConvenienceLibrary(
-	target = 'egl_haiku',
-	source = sources,
-)
-
-Export('egl_haiku')
diff --git a/src/egl/drivers/haiku/egl_haiku.cpp b/src/egl/drivers/haiku/egl_haiku.cpp
index 3d00e47c8e6..ef74f657b14 100644
--- a/src/egl/drivers/haiku/egl_haiku.cpp
+++ b/src/egl/drivers/haiku/egl_haiku.cpp
@@ -92,8 +92,11 @@ haiku_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
 		return NULL;
 	}
 
-	if (!_eglInitSurface(&surface->surf, disp, EGL_WINDOW_BIT, conf, attrib_list))
-		goto cleanup_surface;
+	if (!_eglInitSurface(&surface->surf, disp, EGL_WINDOW_BIT,
+		conf, attrib_list)) {
+		free(surface);
+		return NULL;
+	}
 
 	(&surface->surf)->SwapInterval = 1;
 
@@ -110,10 +113,6 @@ haiku_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
 	TRACE("Showing window\n");
 	win->Show();
 	return &surface->surf;
-
-cleanup_surface:
-	free(surface);
-	return NULL;
 }
 
 
@@ -139,7 +138,7 @@ haiku_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
 	if (_eglPutSurface(surf)) {
 		// XXX: detach haiku_egl_surface::gl from the native window and destroy it
 		free(surf);
-        }
+	}
 	return EGL_TRUE;
 }
 
@@ -153,7 +152,7 @@ haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 	conf = (struct haiku_egl_config*) calloc(1, sizeof (*conf));
 	if (!conf) {
 		_eglError(EGL_BAD_ALLOC, "haiku_add_configs_for_visuals");
-		return NULL;
+		return EGL_FALSE;
 	}
 
 	_eglInitConfig(&conf->base, dpy, 1);
@@ -165,7 +164,7 @@ haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 	_eglSetConfigKey(&conf->base, EGL_LUMINANCE_SIZE, 0);
 	_eglSetConfigKey(&conf->base, EGL_ALPHA_SIZE, 8);
 	_eglSetConfigKey(&conf->base, EGL_COLOR_BUFFER_TYPE, EGL_RGB_BUFFER);
-	EGLint r = (_eglGetConfigKey(&conf->base, EGL_RED_SIZE) 
+	EGLint r = (_eglGetConfigKey(&conf->base, EGL_RED_SIZE)
 		+ _eglGetConfigKey(&conf->base, EGL_GREEN_SIZE)
 		+ _eglGetConfigKey(&conf->base, EGL_BLUE_SIZE)
 		+ _eglGetConfigKey(&conf->base, EGL_ALPHA_SIZE));
@@ -195,7 +194,7 @@ haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 		goto cleanup;
 	}
 	TRACE("Validated config\n");
-   
+
 	_eglLinkConfig(&conf->base);
 	if (!_eglGetArraySize(dpy->Configs)) {
 		_eglLog(_EGL_WARNING, "Haiku: failed to create any config");
@@ -210,6 +209,7 @@ cleanup:
 	return EGL_FALSE;
 }
 
+
 extern "C"
 EGLBoolean
 init_haiku(_EGLDriver *drv, _EGLDisplay *dpy)
@@ -221,7 +221,7 @@ init_haiku(_EGLDriver *drv, _EGLDisplay *dpy)
 		return EGL_FALSE;
 
 	dpy->Version = 14;
-   
+
 	TRACE("Initialization finished\n");
 
 	return EGL_TRUE;
@@ -271,7 +271,7 @@ haiku_destroy_context(_EGLDriver* drv, _EGLDisplay *disp, _EGLContext* ctx)
 	if (_eglPutContext(ctx)) {
 		// XXX: teardown the context ?
 		free(context);
-		ctx = NULL
+		ctx = NULL;
 	}
 	return EGL_TRUE;
 }
@@ -280,7 +280,7 @@ haiku_destroy_context(_EGLDriver* drv, _EGLDisplay *disp, _EGLContext* ctx)
 extern "C"
 EGLBoolean
 haiku_make_current(_EGLDriver* drv, _EGLDisplay* dpy, _EGLSurface *dsurf,
-		  _EGLSurface *rsurf, _EGLContext *ctx)
+	_EGLSurface *rsurf, _EGLContext *ctx)
 {
 	CALLED();
 
@@ -314,7 +314,7 @@ extern "C"
 void
 haiku_unload(_EGLDriver* drv)
 {
-	
+
 }
 
 
diff --git a/src/egl/main/Makefile.sources b/src/egl/main/Makefile.sources
deleted file mode 100644
index e39a80f14a6..00000000000
--- a/src/egl/main/Makefile.sources
+++ /dev/null
@@ -1,31 +0,0 @@
-LIBEGL_C_FILES := \
-	eglapi.c \
-	eglapi.h \
-	eglarray.c \
-	eglarray.h \
-	eglcompiler.h \
-	eglconfig.c \
-	eglconfig.h \
-	eglcontext.c \
-	eglcontext.h \
-	eglcurrent.c \
-	eglcurrent.h \
-	egldefines.h \
-	egldisplay.c \
-	egldisplay.h \
-	egldriver.c \
-	egldriver.h \
-	eglfallbacks.c \
-	eglglobals.c \
-	eglglobals.h \
-	eglimage.c \
-	eglimage.h \
-	egllog.c \
-	egllog.h \
-	eglstring.c \
-	eglstring.h \
-	eglsurface.c \
-	eglsurface.h \
-	eglsync.c \
-	eglsync.h \
-	egltypedefs.h
diff --git a/src/egl/main/SConscript b/src/egl/main/SConscript
deleted file mode 100644
index c0012831bb9..00000000000
--- a/src/egl/main/SConscript
+++ /dev/null
@@ -1,52 +0,0 @@
-#######################################################################
-# SConscript for EGL
-
-
-Import('*')
-
-env = env.Clone()
-
-env.Append(CPPDEFINES = [
-    '_EGL_DRIVER_SEARCH_DIR=\\"\\"',
-])
-
-if env['platform'] == 'haiku':
-    env.Append(CPPDEFINES = [
-        '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
-        '_EGL_OS_UNIX',
-        '_EGL_BUILT_IN_DRIVER_HAIKU',
-    ])
-    env.Prepend(LIBS = [
-        egl_haiku,
-        libloader,
-    ])
-else:
-    env.Append(CPPDEFINES = [
-        '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_X11',
-        '_EGL_OS_UNIX',
-    ])
-    if env['dri']:
-        env.Prepend(LIBS = [
-            egl_dri2,
-            libloader,
-        ])
-    # Disallow undefined symbols
-    if env['platform'] != 'darwin':
-        env.Append(SHLINKFLAGS = ['-Wl,-z,defs'])
-
-env.Append(CPPPATH = [
-    '#/include',
-])
-
-
-# parse Makefile.sources
-egl_sources = env.ParseSourceList('Makefile.sources', 'LIBEGL_C_FILES')
-
-egl = env.SharedLibrary(
-    target = 'EGL',
-    source = egl_sources,
-)
-
-egl = env.InstallSharedLibrary(egl, version=(1, 0, 0))
-
-env.Alias('egl', egl)
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 105e919683a..323634e4511 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -100,7 +100,6 @@
 #include "eglconfig.h"
 #include "eglimage.h"
 #include "eglsync.h"
-#include "eglstring.h"
 
 
 /**
@@ -381,48 +380,47 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
 
    char *exts = dpy->ExtensionsString;
 
-   _EGL_CHECK_EXTENSION(MESA_drm_display);
-   _EGL_CHECK_EXTENSION(MESA_drm_image);
-   _EGL_CHECK_EXTENSION(MESA_configless_context);
-
-   _EGL_CHECK_EXTENSION(WL_bind_wayland_display);
-   _EGL_CHECK_EXTENSION(WL_create_wayland_buffer_from_image);
-
-   _EGL_CHECK_EXTENSION(KHR_image_base);
-   _EGL_CHECK_EXTENSION(KHR_image_pixmap);
-   if (dpy->Extensions.KHR_image_base && dpy->Extensions.KHR_image_pixmap)
-      _eglAppendExtension(&exts, "EGL_KHR_image");
-
-   _EGL_CHECK_EXTENSION(KHR_vg_parent_image);
-   _EGL_CHECK_EXTENSION(KHR_get_all_proc_addresses);
-   _EGL_CHECK_EXTENSION(KHR_gl_texture_2D_image);
-   _EGL_CHECK_EXTENSION(KHR_gl_texture_cubemap_image);
-   _EGL_CHECK_EXTENSION(KHR_gl_texture_3D_image);
-   _EGL_CHECK_EXTENSION(KHR_gl_renderbuffer_image);
-
-   _EGL_CHECK_EXTENSION(KHR_reusable_sync);
-   _EGL_CHECK_EXTENSION(KHR_fence_sync);
-   _EGL_CHECK_EXTENSION(KHR_wait_sync);
-   _EGL_CHECK_EXTENSION(KHR_cl_event2);
-
-   _EGL_CHECK_EXTENSION(KHR_surfaceless_context);
-   _EGL_CHECK_EXTENSION(KHR_create_context);
-
-   _EGL_CHECK_EXTENSION(NOK_swap_region);
-   _EGL_CHECK_EXTENSION(NOK_texture_from_pixmap);
-
+   /* Please keep these sorted alphabetically. */
    _EGL_CHECK_EXTENSION(ANDROID_image_native_buffer);
 
    _EGL_CHECK_EXTENSION(CHROMIUM_sync_control);
 
-   _EGL_CHECK_EXTENSION(EXT_create_context_robustness);
    _EGL_CHECK_EXTENSION(EXT_buffer_age);
-   _EGL_CHECK_EXTENSION(EXT_swap_buffers_with_damage);
+   _EGL_CHECK_EXTENSION(EXT_create_context_robustness);
    _EGL_CHECK_EXTENSION(EXT_image_dma_buf_import);
+   _EGL_CHECK_EXTENSION(EXT_swap_buffers_with_damage);
+
+   _EGL_CHECK_EXTENSION(KHR_cl_event2);
+   _EGL_CHECK_EXTENSION(KHR_create_context);
+   _EGL_CHECK_EXTENSION(KHR_fence_sync);
+   _EGL_CHECK_EXTENSION(KHR_get_all_proc_addresses);
+   _EGL_CHECK_EXTENSION(KHR_gl_colorspace);
+   _EGL_CHECK_EXTENSION(KHR_gl_renderbuffer_image);
+   _EGL_CHECK_EXTENSION(KHR_gl_texture_2D_image);
+   _EGL_CHECK_EXTENSION(KHR_gl_texture_3D_image);
+   _EGL_CHECK_EXTENSION(KHR_gl_texture_cubemap_image);
+   if (dpy->Extensions.KHR_image_base && dpy->Extensions.KHR_image_pixmap)
+      _eglAppendExtension(&exts, "EGL_KHR_image");
+   _EGL_CHECK_EXTENSION(KHR_image_base);
+   _EGL_CHECK_EXTENSION(KHR_image_pixmap);
+   _EGL_CHECK_EXTENSION(KHR_reusable_sync);
+   _EGL_CHECK_EXTENSION(KHR_surfaceless_context);
+   _EGL_CHECK_EXTENSION(KHR_vg_parent_image);
+   _EGL_CHECK_EXTENSION(KHR_wait_sync);
+
+   _EGL_CHECK_EXTENSION(MESA_configless_context);
+   _EGL_CHECK_EXTENSION(MESA_drm_display);
+   _EGL_CHECK_EXTENSION(MESA_drm_image);
+   _EGL_CHECK_EXTENSION(MESA_image_dma_buf_export);
+
+   _EGL_CHECK_EXTENSION(NOK_swap_region);
+   _EGL_CHECK_EXTENSION(NOK_texture_from_pixmap);
 
    _EGL_CHECK_EXTENSION(NV_post_sub_buffer);
 
-   _EGL_CHECK_EXTENSION(MESA_image_dma_buf_export);
+   _EGL_CHECK_EXTENSION(WL_bind_wayland_display);
+   _EGL_CHECK_EXTENSION(WL_create_wayland_buffer_from_image);
+
 #undef _EGL_CHECK_EXTENSION
 }
 
@@ -507,7 +505,7 @@ eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor)
       _eglComputeVersion(disp);
       _eglCreateExtensionsString(disp);
       _eglCreateAPIsString(disp);
-      _eglsnprintf(disp->VersionString, sizeof(disp->VersionString),
+      snprintf(disp->VersionString, sizeof(disp->VersionString),
               "%d.%d (%s)", disp->Version / 10, disp->Version % 10,
               disp->Driver->Name);
    }
@@ -1015,8 +1013,6 @@ eglSwapBuffers(EGLDisplay dpy, EGLSurface surface)
 }
 
 
-#ifdef EGL_EXT_swap_buffers_with_damage
-
 static EGLBoolean EGLAPIENTRY
 eglSwapBuffersWithDamageEXT(EGLDisplay dpy, EGLSurface surface,
                             EGLint *rects, EGLint n_rects)
@@ -1042,8 +1038,6 @@ eglSwapBuffersWithDamageEXT(EGLDisplay dpy, EGLSurface surface,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#endif /* EGL_EXT_swap_buffers_with_damage */
-
 EGLBoolean EGLAPIENTRY
 eglCopyBuffers(EGLDisplay dpy, EGLSurface surface, EGLNativePixmapType target)
 {
@@ -1204,8 +1198,6 @@ eglGetError(void)
 }
 
 
-#ifdef EGL_MESA_drm_display
-
 static EGLDisplay EGLAPIENTRY
 eglGetDRMDisplayMESA(int fd)
 {
@@ -1213,8 +1205,6 @@ eglGetDRMDisplayMESA(int fd)
    return _eglGetDisplayHandle(dpy);
 }
 
-#endif /* EGL_MESA_drm_display */
-
 /**
  ** EGL 1.2
  **/
@@ -1580,8 +1570,6 @@ eglGetSyncAttribKHR(EGLDisplay dpy, EGLSync sync, EGLint attribute, EGLint *valu
 }
 
 
-#ifdef EGL_NOK_swap_region
-
 static EGLBoolean EGLAPIENTRY
 eglSwapBuffersRegionNOK(EGLDisplay dpy, EGLSurface surface,
 			EGLint numRects, const EGLint *rects)
@@ -1607,10 +1595,6 @@ eglSwapBuffersRegionNOK(EGLDisplay dpy, EGLSurface surface,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#endif /* EGL_NOK_swap_region */
-
-
-#ifdef EGL_MESA_drm_image
 
 static EGLImage EGLAPIENTRY
 eglCreateDRMImageMESA(EGLDisplay dpy, const EGLint *attr_list)
@@ -1650,9 +1634,7 @@ eglExportDRMImageMESA(EGLDisplay dpy, EGLImage image,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#endif
 
-#ifdef EGL_WL_bind_wayland_display
 struct wl_display;
 
 static EGLBoolean EGLAPIENTRY
@@ -1709,9 +1691,8 @@ eglQueryWaylandBufferWL(EGLDisplay dpy, struct wl_resource *buffer,
 
    RETURN_EGL_EVAL(disp, ret);
 }
-#endif
 
-#ifdef EGL_WL_create_wayland_buffer_from_image
+
 static struct wl_buffer * EGLAPIENTRY
 eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImage image)
 {
@@ -1732,7 +1713,6 @@ eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImage image)
 
    RETURN_EGL_EVAL(disp, ret);
 }
-#endif
 
 static EGLBoolean EGLAPIENTRY
 eglPostSubBufferNV(EGLDisplay dpy, EGLSurface surface,
@@ -1775,7 +1755,6 @@ eglGetSyncValuesCHROMIUM(EGLDisplay display, EGLSurface surface,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-#ifdef EGL_MESA_image_dma_buf_export
 static EGLBoolean EGLAPIENTRY
 eglExportDMABUFImageQueryMESA(EGLDisplay dpy, EGLImage image,
                               EGLint *fourcc, EGLint *nplanes,
@@ -1817,7 +1796,6 @@ eglExportDMABUFImageMESA(EGLDisplay dpy, EGLImage image,
 
    RETURN_EGL_EVAL(disp, ret);
 }
-#endif
 
 __eglMustCastToProperFunctionPointerType EGLAPIENTRY
 eglGetProcAddress(const char *procname)
@@ -1874,9 +1852,7 @@ eglGetProcAddress(const char *procname)
       { "eglGetPlatformDisplay", (_EGLProc) eglGetPlatformDisplay },
       { "eglCreatePlatformWindowSurface", (_EGLProc) eglCreatePlatformWindowSurface },
       { "eglCreatePlatformPixmapSurface", (_EGLProc) eglCreatePlatformPixmapSurface },
-#ifdef EGL_MESA_drm_display
       { "eglGetDRMDisplayMESA", (_EGLProc) eglGetDRMDisplayMESA },
-#endif
       { "eglCreateImageKHR", (_EGLProc) eglCreateImageKHR },
       { "eglDestroyImageKHR", (_EGLProc) eglDestroyImage },
       { "eglCreateSyncKHR", (_EGLProc) eglCreateSyncKHR },
@@ -1886,33 +1862,21 @@ eglGetProcAddress(const char *procname)
       { "eglWaitSyncKHR", (_EGLProc) eglWaitSyncKHR },
       { "eglSignalSyncKHR", (_EGLProc) eglSignalSyncKHR },
       { "eglGetSyncAttribKHR", (_EGLProc) eglGetSyncAttribKHR },
-#ifdef EGL_NOK_swap_region
       { "eglSwapBuffersRegionNOK", (_EGLProc) eglSwapBuffersRegionNOK },
-#endif
-#ifdef EGL_MESA_drm_image
       { "eglCreateDRMImageMESA", (_EGLProc) eglCreateDRMImageMESA },
       { "eglExportDRMImageMESA", (_EGLProc) eglExportDRMImageMESA },
-#endif
-#ifdef EGL_WL_bind_wayland_display
       { "eglBindWaylandDisplayWL", (_EGLProc) eglBindWaylandDisplayWL },
       { "eglUnbindWaylandDisplayWL", (_EGLProc) eglUnbindWaylandDisplayWL },
       { "eglQueryWaylandBufferWL", (_EGLProc) eglQueryWaylandBufferWL },
-#endif
-#ifdef EGL_WL_create_wayland_buffer_from_image
       { "eglCreateWaylandBufferFromImageWL", (_EGLProc) eglCreateWaylandBufferFromImageWL },
-#endif
       { "eglPostSubBufferNV", (_EGLProc) eglPostSubBufferNV },
-#ifdef EGL_EXT_swap_buffers_with_damage
       { "eglSwapBuffersWithDamageEXT", (_EGLProc) eglSwapBuffersWithDamageEXT },
-#endif
       { "eglGetPlatformDisplayEXT", (_EGLProc) eglGetPlatformDisplayEXT },
       { "eglCreatePlatformWindowSurfaceEXT", (_EGLProc) eglCreatePlatformWindowSurfaceEXT },
       { "eglCreatePlatformPixmapSurfaceEXT", (_EGLProc) eglCreatePlatformPixmapSurfaceEXT },
       { "eglGetSyncValuesCHROMIUM", (_EGLProc) eglGetSyncValuesCHROMIUM },
-#ifdef EGL_MESA_image_dma_buf_export
       { "eglExportDMABUFImageQueryMESA", (_EGLProc) eglExportDMABUFImageQueryMESA },
       { "eglExportDMABUFImageMESA", (_EGLProc) eglExportDMABUFImageMESA },
-#endif
       { NULL, NULL }
    };
    EGLint i;
diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h
index 4e0378d0d5f..6c54c7c410d 100644
--- a/src/egl/main/eglapi.h
+++ b/src/egl/main/eglapi.h
@@ -99,41 +99,29 @@ typedef EGLBoolean (*SignalSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSyn
 typedef EGLBoolean (*GetSyncAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint attribute, EGLAttrib *value);
 
 
-#ifdef EGL_NOK_swap_region
 typedef EGLBoolean (*SwapBuffersRegionNOK_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf, EGLint numRects, const EGLint *rects);
-#endif
 
-#ifdef EGL_MESA_drm_image
 typedef _EGLImage *(*CreateDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, const EGLint *attr_list);
 typedef EGLBoolean (*ExportDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *name, EGLint *handle, EGLint *stride);
-#endif
 
-#ifdef EGL_WL_bind_wayland_display
 struct wl_display;
 typedef EGLBoolean (*BindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display);
 typedef EGLBoolean (*UnbindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display);
 typedef EGLBoolean (*QueryWaylandBufferWL_t)(_EGLDriver *drv, _EGLDisplay *displ, struct wl_resource *buffer, EGLint attribute, EGLint *value);
-#endif
 
-#ifdef EGL_WL_create_wayland_buffer_from_image
 typedef struct wl_buffer * (*CreateWaylandBufferFromImageWL_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img);
-#endif
 
 typedef EGLBoolean (*PostSubBufferNV_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surface, EGLint x, EGLint y, EGLint width, EGLint height);
 
 typedef EGLint (*QueryBufferAge_t)(_EGLDriver *drv,
                                    _EGLDisplay *dpy, _EGLSurface *surface);
 
-#ifdef EGL_EXT_swap_buffers_with_damage
 typedef EGLBoolean (*SwapBuffersWithDamageEXT_t) (_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, const EGLint *rects, EGLint n_rects);
-#endif
 
 typedef EGLBoolean (*GetSyncValuesCHROMIUM_t) (_EGLDisplay *dpy, _EGLSurface *surface, EGLuint64KHR *ust, EGLuint64KHR *msc, EGLuint64KHR *sbc);
 
-#ifdef EGL_MESA_image_dma_buf_export
 typedef EGLBoolean (*ExportDMABUFImageQueryMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fourcc, EGLint *nplanes, EGLuint64KHR *modifiers);
 typedef EGLBoolean (*ExportDMABUFImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fds, EGLint *strides, EGLint *offsets);
-#endif
 
 /**
  * The API dispatcher jumps through these functions
@@ -180,38 +168,26 @@ struct _egl_api
    SignalSyncKHR_t SignalSyncKHR;
    GetSyncAttrib_t GetSyncAttrib;
 
-#ifdef EGL_NOK_swap_region
    SwapBuffersRegionNOK_t SwapBuffersRegionNOK;
-#endif
 
-#ifdef EGL_MESA_drm_image
    CreateDRMImageMESA_t CreateDRMImageMESA;
    ExportDRMImageMESA_t ExportDRMImageMESA;
-#endif
 
-#ifdef EGL_WL_bind_wayland_display
    BindWaylandDisplayWL_t BindWaylandDisplayWL;
    UnbindWaylandDisplayWL_t UnbindWaylandDisplayWL;
    QueryWaylandBufferWL_t QueryWaylandBufferWL;
-#endif
 
-#ifdef EGL_WL_create_wayland_buffer_from_image
    CreateWaylandBufferFromImageWL_t CreateWaylandBufferFromImageWL;
-#endif
 
-#ifdef EGL_EXT_swap_buffers_with_damage
    SwapBuffersWithDamageEXT_t SwapBuffersWithDamageEXT;
-#endif /* EGL_EXT_swap_buffers_with_damage */
 
    PostSubBufferNV_t PostSubBufferNV;
 
    QueryBufferAge_t QueryBufferAge;
    GetSyncValuesCHROMIUM_t GetSyncValuesCHROMIUM;
 
-#ifdef EGL_MESA_image_dma_buf_export
    ExportDMABUFImageQueryMESA_t ExportDMABUFImageQueryMESA;
    ExportDMABUFImageMESA_t ExportDMABUFImageMESA;
-#endif
 };
 
 
diff --git a/src/egl/main/eglarray.c b/src/egl/main/eglarray.c
index 3ccc8a649f0..d2f39af49a6 100644
--- a/src/egl/main/eglarray.c
+++ b/src/egl/main/eglarray.c
@@ -197,6 +197,9 @@ _eglFlattenArray(_EGLArray *array, void *buffer, EGLint elem_size, EGLint size,
 
    count = array->Size;
    if (buffer) {
+      /* treat a negative size as 0 so that count never goes negative */
+      if (size < 0)
+         size = 0;
       /* do not exceed buffer size */
       if (count > size)
          count = size;
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index cf65c69b7b4..c445d9b0c92 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -83,7 +83,8 @@ _eglLinkConfig(_EGLConfig *conf)
    _EGLDisplay *dpy = conf->Display;
 
    /* sanity check */
-   assert(dpy && conf->ConfigID > 0);
+   assert(dpy);
+   assert(conf->ConfigID > 0);
 
    if (!dpy->Configs) {
       dpy->Configs = _eglCreateArray("Config", 16);
diff --git a/src/egl/main/eglcontext.c b/src/egl/main/eglcontext.c
index e767f4b1abe..588f48921f2 100644
--- a/src/egl/main/eglcontext.c
+++ b/src/egl/main/eglcontext.c
@@ -101,11 +101,42 @@ _eglParseContextAttribList(_EGLContext *ctx, _EGLDisplay *dpy,
 
       switch (attr) {
       case EGL_CONTEXT_CLIENT_VERSION:
+         /* The EGL 1.4 spec says:
+          *
+          *     "attribute EGL_CONTEXT_CLIENT_VERSION is only valid when the
+          *      current rendering API is EGL_OPENGL_ES_API"
+          *
+          * The EGL_KHR_create_context spec says:
+          *
+          *     "EGL_CONTEXT_MAJOR_VERSION_KHR           0x3098
+          *      (this token is an alias for EGL_CONTEXT_CLIENT_VERSION)"
+          *
+          *     "The values for attributes EGL_CONTEXT_MAJOR_VERSION_KHR and
+          *      EGL_CONTEXT_MINOR_VERSION_KHR specify the requested client API
+          *      version. They are only meaningful for OpenGL and OpenGL ES
+          *      contexts, and specifying them for other types of contexts will
+          *      generate an error."
+          */
+         if ((api != EGL_OPENGL_ES_API &&
+             (!dpy->Extensions.KHR_create_context || api != EGL_OPENGL_API))) {
+               err = EGL_BAD_ATTRIBUTE;
+               break;
+         }
+
          ctx->ClientMajorVersion = val;
          break;
 
       case EGL_CONTEXT_MINOR_VERSION_KHR:
-         if (!dpy->Extensions.KHR_create_context) {
+         /* The EGL_KHR_create_context spec says:
+          *
+          *     "The values for attributes EGL_CONTEXT_MAJOR_VERSION_KHR and
+          *      EGL_CONTEXT_MINOR_VERSION_KHR specify the requested client API
+          *      version. They are only meaningful for OpenGL and OpenGL ES
+          *      contexts, and specifying them for other types of contexts will
+          *      generate an error."
+          */
+         if (!dpy->Extensions.KHR_create_context ||
+             (api != EGL_OPENGL_ES_API && api != EGL_OPENGL_API)) {
             err = EGL_BAD_ATTRIBUTE;
             break;
          }
diff --git a/src/egl/main/egldisplay.c b/src/egl/main/egldisplay.c
index 24a0c7e61a7..f6db03ab50c 100644
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -65,11 +65,9 @@ static const struct {
    _EGLPlatformType platform;
    const char *name;
 } egl_platforms[_EGL_NUM_PLATFORMS] = {
-   { _EGL_PLATFORM_WINDOWS, "gdi" },
    { _EGL_PLATFORM_X11, "x11" },
    { _EGL_PLATFORM_WAYLAND, "wayland" },
    { _EGL_PLATFORM_DRM, "drm" },
-   { _EGL_PLATFORM_NULL, "null" },
    { _EGL_PLATFORM_ANDROID, "android" },
    { _EGL_PLATFORM_HAIKU, "haiku" },
    { _EGL_PLATFORM_SURFACELESS, "surfaceless" },
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index 0b50a36a098..6c64980cf20 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -44,11 +44,9 @@ extern "C" {
 #endif
 
 enum _egl_platform_type {
-   _EGL_PLATFORM_WINDOWS,
    _EGL_PLATFORM_X11,
    _EGL_PLATFORM_WAYLAND,
    _EGL_PLATFORM_DRM,
-   _EGL_PLATFORM_NULL,
    _EGL_PLATFORM_ANDROID,
    _EGL_PLATFORM_HAIKU,
    _EGL_PLATFORM_SURFACELESS,
@@ -91,46 +89,44 @@ struct _egl_resource
  */
 struct _egl_extensions
 {
-   EGLBoolean MESA_drm_display;
-   EGLBoolean MESA_drm_image;
-   EGLBoolean MESA_configless_context;
-
-   EGLBoolean WL_bind_wayland_display;
-   EGLBoolean WL_create_wayland_buffer_from_image;
-
-   EGLBoolean KHR_image_base;
-   EGLBoolean KHR_image_pixmap;
-   EGLBoolean KHR_vg_parent_image;
-   EGLBoolean KHR_get_all_proc_addresses;
-   EGLBoolean KHR_gl_colorspace;
-   EGLBoolean KHR_gl_texture_2D_image;
-   EGLBoolean KHR_gl_texture_cubemap_image;
-   EGLBoolean KHR_gl_texture_3D_image;
-   EGLBoolean KHR_gl_renderbuffer_image;
-
-   EGLBoolean KHR_reusable_sync;
-   EGLBoolean KHR_fence_sync;
-   EGLBoolean KHR_wait_sync;
-   EGLBoolean KHR_cl_event2;
-
-   EGLBoolean KHR_surfaceless_context;
-   EGLBoolean KHR_create_context;
-
-   EGLBoolean NOK_swap_region;
-   EGLBoolean NOK_texture_from_pixmap;
-
+   /* Please keep these sorted alphabetically. */
    EGLBoolean ANDROID_image_native_buffer;
 
    EGLBoolean CHROMIUM_sync_control;
 
+   EGLBoolean EXT_buffer_age;
+   EGLBoolean EXT_create_context_robustness;
+   EGLBoolean EXT_image_dma_buf_import;
+   EGLBoolean EXT_swap_buffers_with_damage;
+
+   EGLBoolean KHR_cl_event2;
+   EGLBoolean KHR_create_context;
+   EGLBoolean KHR_fence_sync;
+   EGLBoolean KHR_get_all_proc_addresses;
+   EGLBoolean KHR_gl_colorspace;
+   EGLBoolean KHR_gl_renderbuffer_image;
+   EGLBoolean KHR_gl_texture_2D_image;
+   EGLBoolean KHR_gl_texture_3D_image;
+   EGLBoolean KHR_gl_texture_cubemap_image;
+   EGLBoolean KHR_image_base;
+   EGLBoolean KHR_image_pixmap;
+   EGLBoolean KHR_reusable_sync;
+   EGLBoolean KHR_surfaceless_context;
+   EGLBoolean KHR_vg_parent_image;
+   EGLBoolean KHR_wait_sync;
+
+   EGLBoolean MESA_configless_context;
+   EGLBoolean MESA_drm_display;
+   EGLBoolean MESA_drm_image;
+   EGLBoolean MESA_image_dma_buf_export;
+
+   EGLBoolean NOK_swap_region;
+   EGLBoolean NOK_texture_from_pixmap;
+
    EGLBoolean NV_post_sub_buffer;
 
-   EGLBoolean EXT_create_context_robustness;
-   EGLBoolean EXT_buffer_age;
-   EGLBoolean EXT_swap_buffers_with_damage;
-   EGLBoolean EXT_image_dma_buf_import;
-
-   EGLBoolean MESA_image_dma_buf_export;
+   EGLBoolean WL_bind_wayland_display;
+   EGLBoolean WL_create_wayland_buffer_from_image;
 };
 
 
diff --git a/src/egl/main/egldriver.c b/src/egl/main/egldriver.c
index 6ef79d96502..b9b21dec5ea 100644
--- a/src/egl/main/egldriver.c
+++ b/src/egl/main/egldriver.c
@@ -39,7 +39,6 @@
 #include <stdlib.h>
 #include "c11/threads.h"
 
-#include "eglstring.h"
 #include "egldefines.h"
 #include "egldisplay.h"
 #include "egldriver.h"
@@ -97,15 +96,10 @@ _eglLoadModule(_EGLModule *mod)
 static void
 _eglUnloadModule(_EGLModule *mod)
 {
-#if defined(_EGL_OS_UNIX)
    /* destroy the driver */
    if (mod->Driver && mod->Driver->Unload)
       mod->Driver->Unload(mod->Driver);
 
-#elif defined(_EGL_OS_WINDOWS)
-   /* XXX Windows unloads DLLs before atexit */
-#endif
-
    mod->Driver = NULL;
 }
 
@@ -135,7 +129,7 @@ _eglAddModule(const char *name)
    /* allocate a new one */
    mod = calloc(1, sizeof(*mod));
    if (mod) {
-      mod->Name = _eglstrdup(name);
+      mod->Name = strdup(name);
       if (!mod->Name) {
          free(mod);
          mod = NULL;
diff --git a/src/egl/main/eglfallbacks.c b/src/egl/main/eglfallbacks.c
index 3c3701f4ae9..65daf8fd0f5 100644
--- a/src/egl/main/eglfallbacks.c
+++ b/src/egl/main/eglfallbacks.c
@@ -93,17 +93,11 @@ _eglInitDriverFallbacks(_EGLDriver *drv)
    drv->API.SignalSyncKHR = NULL;
    drv->API.GetSyncAttrib = _eglGetSyncAttrib;
 
-#ifdef EGL_MESA_drm_image
    drv->API.CreateDRMImageMESA = NULL;
    drv->API.ExportDRMImageMESA = NULL;
-#endif
 
-#ifdef EGL_NOK_swap_region
    drv->API.SwapBuffersRegionNOK = NULL;
-#endif
 
-#ifdef EGL_MESA_image_dma_buf_export
    drv->API.ExportDMABUFImageQueryMESA = NULL;
    drv->API.ExportDMABUFImageMESA = NULL;
-#endif
 }
diff --git a/src/egl/main/eglglobals.c b/src/egl/main/eglglobals.c
index 884cff0c36b..938d9537891 100644
--- a/src/egl/main/eglglobals.c
+++ b/src/egl/main/eglglobals.c
@@ -53,10 +53,10 @@ struct _egl_global _eglGlobal =
    /* ClientExtensionsString */
    "EGL_EXT_client_extensions"
    " EGL_EXT_platform_base"
-   " EGL_EXT_platform_x11"
    " EGL_EXT_platform_wayland"
-   " EGL_MESA_platform_gbm"
+   " EGL_EXT_platform_x11"
    " EGL_KHR_client_get_all_proc_addresses"
+   " EGL_MESA_platform_gbm"
 };
 
 
diff --git a/src/egl/main/egllog.c b/src/egl/main/egllog.c
index 1877d8bfd10..956946532cd 100644
--- a/src/egl/main/egllog.c
+++ b/src/egl/main/egllog.c
@@ -38,10 +38,11 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <strings.h>
 #include "c11/threads.h"
 
 #include "egllog.h"
-#include "eglstring.h"
 
 #define MAXSTRING 1000
 #define FALLBACK_LOG_LEVEL _EGL_WARNING
@@ -146,7 +147,7 @@ _eglInitLogger(void)
    log_env = getenv("EGL_LOG_LEVEL");
    if (log_env) {
       for (i = 0; level_strings[i]; i++) {
-         if (_eglstrcasecmp(log_env, level_strings[i]) == 0) {
+         if (strcasecmp(log_env, level_strings[i]) == 0) {
             level = i;
             break;
          }
diff --git a/src/egl/main/eglstring.c b/src/egl/main/eglstring.c
deleted file mode 100644
index 8b4c491ac64..00000000000
--- a/src/egl/main/eglstring.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010-2011 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * String utils.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include "eglstring.h"
-
-
-char *
-_eglstrdup(const char *s)
-{
-   if (s) {
-      size_t l = strlen(s);
-      char *s2 = malloc(l + 1);
-      if (s2)
-         strcpy(s2, s);
-      return s2;
-   }
-   return NULL;
-}
-
-
-
diff --git a/src/egl/main/eglstring.h b/src/egl/main/eglstring.h
deleted file mode 100644
index 16baa477714..00000000000
--- a/src/egl/main/eglstring.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010-2011 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef EGLSTRING_INCLUDED
-#define EGLSTRING_INCLUDED
-
-#include <string.h>
-#include <stdio.h>
-
-#ifdef _EGL_OS_WINDOWS
-#define _eglstrcasecmp _stricmp
-#define _eglsnprintf _snprintf
-#else
-#include <strings.h> // for strcasecmp
-#define _eglstrcasecmp strcasecmp
-#define _eglsnprintf snprintf
-#endif
-
-extern char *
-_eglstrdup(const char *s);
-
-
-#endif /* EGLSTRING_INCLUDED */
diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index 76c60e940dc..4fa43f3e2b1 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -84,6 +84,22 @@ _eglParseSurfaceAttribList(_EGLSurface *surf, const EGLint *attrib_list)
 
       switch (attr) {
       /* common attributes */
+      case EGL_GL_COLORSPACE_KHR:
+         if (!dpy->Extensions.KHR_gl_colorspace) {
+            err = EGL_BAD_ATTRIBUTE;
+            break;
+         }
+         switch (val) {
+         case EGL_GL_COLORSPACE_SRGB_KHR:
+         case EGL_GL_COLORSPACE_LINEAR_KHR:
+            break;
+         default:
+            err = EGL_BAD_ATTRIBUTE;
+         }
+         if (err != EGL_SUCCESS)
+            break;
+         surf->GLColorspace = val;
+         break;
       case EGL_VG_COLORSPACE:
          switch (val) {
          case EGL_VG_COLORSPACE_sRGB:
@@ -272,6 +288,7 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
    surf->RenderBuffer = renderBuffer;
    surf->VGAlphaFormat = EGL_VG_ALPHA_FORMAT_NONPRE;
    surf->VGColorspace = EGL_VG_COLORSPACE_sRGB;
+   surf->GLColorspace = EGL_GL_COLORSPACE_LINEAR_KHR;
 
    surf->MipmapLevel = 0;
    surf->MultisampleResolve = EGL_MULTISAMPLE_RESOLVE_DEFAULT;
@@ -309,7 +326,8 @@ _eglQuerySurface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
       *value = surface->Config->ConfigID;
       break;
    case EGL_LARGEST_PBUFFER:
-      *value = surface->LargestPbuffer;
+      if (surface->Type == EGL_PBUFFER_BIT)
+         *value = surface->LargestPbuffer;
       break;
    case EGL_TEXTURE_FORMAT:
       /* texture attributes: only for pbuffers, no error otherwise */
@@ -352,6 +370,13 @@ _eglQuerySurface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface,
    case EGL_VG_COLORSPACE:
       *value = surface->VGColorspace;
       break;
+   case EGL_GL_COLORSPACE_KHR:
+      if (!dpy->Extensions.KHR_gl_colorspace) {
+         _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface");
+         return EGL_FALSE;
+      }
+      *value = surface->GLColorspace;
+      break;
    case EGL_POST_SUB_BUFFER_SUPPORTED_NV:
       *value = surface->PostSubBufferSupportedNV;
       break;
diff --git a/src/egl/main/eglsurface.h b/src/egl/main/eglsurface.h
index 74c429a9628..fc799ee43dc 100644
--- a/src/egl/main/eglsurface.h
+++ b/src/egl/main/eglsurface.h
@@ -65,6 +65,7 @@ struct _egl_surface
    EGLenum RenderBuffer;
    EGLenum VGAlphaFormat;
    EGLenum VGColorspace;
+   EGLenum GLColorspace;
 
    /* attributes set by eglSurfaceAttrib */
    EGLint MipmapLevel;
diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk
index b946681840c..39e064e9538 100644
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -34,7 +34,7 @@ SUBDIRS := auxiliary
 
 # swrast
 ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
-SUBDIRS += winsys/sw/dri winsys/sw/kms-dri drivers/softpipe
+SUBDIRS += winsys/sw/dri drivers/softpipe
 endif
 
 # freedreno
@@ -72,6 +72,7 @@ SUBDIRS += drivers/r600
 endif
 ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += drivers/radeonsi
+SUBDIRS += winsys/amdgpu/drm
 endif
 endif
 endif
diff --git a/src/gallium/Automake.inc b/src/gallium/Automake.inc
index 95aae50d64b..ee07ab6c8f9 100644
--- a/src/gallium/Automake.inc
+++ b/src/gallium/Automake.inc
@@ -67,10 +67,3 @@ if HAVE_DRISW
 GALLIUM_PIPE_LOADER_WINSYS_LIBS += \
 	$(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la
 endif
-
-if NEED_WINSYS_XLIB
-GALLIUM_PIPE_LOADER_WINSYS_LIBS += \
-	$(top_builddir)/src/gallium/winsys/sw/xlib/libws_xlib.la \
-	-lX11 -lXext -lXfixes \
-	$(LIBDRM_LIBS)
-endif
diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
index ede6e21233a..e2c1090aa26 100644
--- a/src/gallium/Makefile.am
+++ b/src/gallium/Makefile.am
@@ -58,6 +58,7 @@ endif
 ## radeonsi
 if HAVE_GALLIUM_RADEONSI
 SUBDIRS += drivers/radeonsi
+SUBDIRS += winsys/amdgpu/drm
 endif
 
 ## the radeon winsys - linked in by r300, r600 and radeonsi
diff --git a/src/gallium/README.portability b/src/gallium/README.portability
index adecf4bb798..cf6cc36afbb 100644
--- a/src/gallium/README.portability
+++ b/src/gallium/README.portability
@@ -13,8 +13,6 @@ headers in general, should strictly follow these guidelines to ensure
 
 * Include the p_compiler.h.
 
-* Don't use the 'inline' keyword, use the INLINE macro in p_compiler.h instead.
-
 * Cast explicitly when converting to integer types of smaller sizes.
 
 * Cast explicitly when converting between float, double and integral types.
diff --git a/src/gallium/SConscript b/src/gallium/SConscript
index eeb1c780fcd..fa5fa6e8734 100644
--- a/src/gallium/SConscript
+++ b/src/gallium/SConscript
@@ -46,7 +46,6 @@ if env['platform'] == 'haiku':
 if env['dri']:
     SConscript([
         'winsys/sw/dri/SConscript',
-        'winsys/sw/kms-dri/SConscript',
         'winsys/svga/drm/SConscript',
     ])
 
diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am
index 89c7a13e913..04f77d002c8 100644
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 if HAVE_LOADER_GALLIUM
 SUBDIRS := pipe-loader
 endif
@@ -10,6 +8,7 @@ include $(top_srcdir)/src/gallium/Automake.inc
 noinst_LTLIBRARIES = libgallium.la
 
 AM_CFLAGS = \
+	-I$(top_srcdir)/src/loader \
 	-I$(top_builddir)/src/glsl/nir \
 	-I$(top_srcdir)/src/gallium/auxiliary/util \
 	$(GALLIUM_CFLAGS) \
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 62e6b94cab8..3616d885b47 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -274,7 +274,6 @@ C_SOURCES := \
 	util/u_simple_shaders.h \
 	util/u_slab.c \
 	util/u_slab.h \
-	util/u_snprintf.c \
 	util/u_split_prim.h \
 	util/u_sse.h \
 	util/u_staging.c \
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index dd56e4a154e..d36f1fbd717 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -80,7 +80,7 @@ unsigned cso_construct_key(void *item, int item_size)
    return hash_key((item), item_size);
 }
 
-static INLINE struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_type type)
+static inline struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_type type)
 {
    struct cso_hash *hash;
    hash = sc->hashes[type];
@@ -127,7 +127,7 @@ static void delete_velements(void *state, void *data)
    FREE(state);
 }
 
-static INLINE void delete_cso(void *state, enum cso_cache_type type)
+static inline void delete_cso(void *state, enum cso_cache_type type)
 {
    switch (type) {
    case CSO_BLEND:
@@ -152,7 +152,7 @@ static INLINE void delete_cso(void *state, enum cso_cache_type type)
 }
 
 
-static INLINE void sanitize_hash(struct cso_cache *sc,
+static inline void sanitize_hash(struct cso_cache *sc,
                                  struct cso_hash *hash,
                                  enum cso_cache_type type,
                                  int max_size)
@@ -162,7 +162,7 @@ static INLINE void sanitize_hash(struct cso_cache *sc,
 }
 
 
-static INLINE void sanitize_cb(struct cso_hash *hash, enum cso_cache_type type,
+static inline void sanitize_cb(struct cso_hash *hash, enum cso_cache_type type,
                                int max_size, void *user_data)
 {
    /* if we're approach the maximum size, remove fourth of the entries
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 744b00cbd92..00686d2af41 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -56,22 +56,8 @@
  */
 struct sampler_info
 {
-   struct {
-      void *samplers[PIPE_MAX_SAMPLERS];
-      unsigned nr_samplers;
-   } hw;
-
    void *samplers[PIPE_MAX_SAMPLERS];
    unsigned nr_samplers;
-
-   void *samplers_saved[PIPE_MAX_SAMPLERS];
-   unsigned nr_samplers_saved;
-
-   struct pipe_sampler_view *views[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   unsigned nr_views;
-
-   struct pipe_sampler_view *views_saved[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   unsigned nr_views_saved;
 };
 
 
@@ -85,6 +71,15 @@ struct cso_context {
    boolean has_tessellation;
    boolean has_streamout;
 
+   struct pipe_sampler_view *fragment_views[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+   unsigned nr_fragment_views;
+
+   struct pipe_sampler_view *fragment_views_saved[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+   unsigned nr_fragment_views_saved;
+
+   void *fragment_samplers_saved[PIPE_MAX_SAMPLERS];
+   unsigned nr_fragment_samplers_saved;
+
    struct sampler_info samplers[PIPE_SHADER_TYPES];
 
    struct pipe_vertex_buffer aux_vertex_buffer_current;
@@ -116,9 +111,6 @@ struct cso_context {
    uint render_condition_mode, render_condition_mode_saved;
    boolean render_condition_cond, render_condition_cond_saved;
 
-   struct pipe_clip_state clip;
-   struct pipe_clip_state clip_saved;
-
    struct pipe_framebuffer_state fb, fb_saved;
    struct pipe_viewport_state vp, vp_saved;
    struct pipe_blend_color blend_color;
@@ -192,7 +184,7 @@ static boolean delete_vertex_elements(struct cso_context *ctx,
 }
 
 
-static INLINE boolean delete_cso(struct cso_context *ctx,
+static inline boolean delete_cso(struct cso_context *ctx,
                                  void *state, enum cso_cache_type type)
 {
    switch (type) {
@@ -213,7 +205,7 @@ static INLINE boolean delete_cso(struct cso_context *ctx,
    return FALSE;
 }
 
-static INLINE void
+static inline void
 sanitize_hash(struct cso_hash *hash, enum cso_cache_type type,
               int max_size, void *user_data)
 {
@@ -297,7 +289,7 @@ out:
  */
 void cso_destroy_context( struct cso_context *ctx )
 {
-   unsigned i, shader;
+   unsigned i;
 
    if (ctx->pipe) {
       ctx->pipe->set_index_buffer(ctx->pipe, NULL);
@@ -347,13 +339,9 @@ void cso_destroy_context( struct cso_context *ctx )
          ctx->pipe->set_stream_output_targets(ctx->pipe, 0, NULL, NULL);
    }
 
-   /* free sampler views for each shader stage */
-   for (shader = 0; shader < Elements(ctx->samplers); shader++) {
-      struct sampler_info *info = &ctx->samplers[shader];
-      for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
-         pipe_sampler_view_reference(&info->views[i], NULL);
-         pipe_sampler_view_reference(&info->views_saved[i], NULL);
-      }
+   for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
+      pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
+      pipe_sampler_view_reference(&ctx->fragment_views_saved[i], NULL);
    }
 
    util_unreference_framebuffer_state(&ctx->fb);
@@ -919,47 +907,6 @@ void cso_restore_tesseval_shader(struct cso_context *ctx)
    ctx->tesseval_shader_saved = NULL;
 }
 
-/* clip state */
-
-static INLINE void
-clip_state_cpy(struct pipe_clip_state *dst,
-               const struct pipe_clip_state *src)
-{
-   memcpy(dst->ucp, src->ucp, sizeof(dst->ucp));
-}
-
-static INLINE int
-clip_state_cmp(const struct pipe_clip_state *a,
-               const struct pipe_clip_state *b)
-{
-   return memcmp(a->ucp, b->ucp, sizeof(a->ucp));
-}
-
-void
-cso_set_clip(struct cso_context *ctx,
-             const struct pipe_clip_state *clip)
-{
-   if (clip_state_cmp(&ctx->clip, clip)) {
-      clip_state_cpy(&ctx->clip, clip);
-      ctx->pipe->set_clip_state(ctx->pipe, clip);
-   }
-}
-
-void
-cso_save_clip(struct cso_context *ctx)
-{
-   clip_state_cpy(&ctx->clip_saved, &ctx->clip);
-}
-
-void
-cso_restore_clip(struct cso_context *ctx)
-{
-   if (clip_state_cmp(&ctx->clip, &ctx->clip_saved)) {
-      clip_state_cpy(&ctx->clip, &ctx->clip_saved);
-      ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip_saved);
-   }
-}
-
 enum pipe_error
 cso_set_vertex_elements(struct cso_context *ctx,
                         unsigned count,
@@ -1122,11 +1069,9 @@ unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx)
 
 /**************** fragment/vertex sampler view state *************************/
 
-static enum pipe_error
-single_sampler(struct cso_context *ctx,
-               struct sampler_info *info,
-               unsigned idx,
-               const struct pipe_sampler_state *templ)
+enum pipe_error
+cso_single_sampler(struct cso_context *ctx, unsigned shader_stage,
+                   unsigned idx, const struct pipe_sampler_state *templ)
 {
    void *handle = NULL;
 
@@ -1162,24 +1107,13 @@ single_sampler(struct cso_context *ctx,
       }
    }
 
-   info->samplers[idx] = handle;
-
+   ctx->samplers[shader_stage].samplers[idx] = handle;
    return PIPE_OK;
 }
 
-enum pipe_error
-cso_single_sampler(struct cso_context *ctx,
-                   unsigned shader_stage,
-                   unsigned idx,
-                   const struct pipe_sampler_state *templ)
-{
-   return single_sampler(ctx, &ctx->samplers[shader_stage], idx, templ);
-}
 
-
-
-static void
-single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
+void
+cso_single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
 {
    struct sampler_info *info = &ctx->samplers[shader_stage];
    unsigned i;
@@ -1191,33 +1125,8 @@ single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
    }
 
    info->nr_samplers = i;
-
-   if (info->hw.nr_samplers != info->nr_samplers ||
-       memcmp(info->hw.samplers,
-              info->samplers,
-              info->nr_samplers * sizeof(void *)) != 0)
-   {
-      memcpy(info->hw.samplers,
-             info->samplers,
-             info->nr_samplers * sizeof(void *));
-
-      /* set remaining slots/pointers to null */
-      for (i = info->nr_samplers; i < info->hw.nr_samplers; i++)
-         info->samplers[i] = NULL;
-
-      ctx->pipe->bind_sampler_states(ctx->pipe, shader_stage, 0,
-                                     MAX2(info->nr_samplers,
-                                          info->hw.nr_samplers),
-                                     info->samplers);
-
-      info->hw.nr_samplers = info->nr_samplers;
-   }
-}
-
-void
-cso_single_sampler_done(struct cso_context *ctx, unsigned shader_stage)
-{
-   single_sampler_done(ctx, shader_stage);
+   ctx->pipe->bind_sampler_states(ctx->pipe, shader_stage, 0, i,
+                                  info->samplers);
 }
 
 
@@ -1240,38 +1149,42 @@ cso_set_samplers(struct cso_context *ctx,
     */
 
    for (i = 0; i < nr; i++) {
-      temp = single_sampler(ctx, info, i, templates[i]);
+      temp = cso_single_sampler(ctx, shader_stage, i, templates[i]);
       if (temp != PIPE_OK)
          error = temp;
    }
 
    for ( ; i < info->nr_samplers; i++) {
-      temp = single_sampler(ctx, info, i, NULL);
+      temp = cso_single_sampler(ctx, shader_stage, i, NULL);
       if (temp != PIPE_OK)
          error = temp;
    }
 
-   single_sampler_done(ctx, shader_stage);
+   cso_single_sampler_done(ctx, shader_stage);
 
    return error;
 }
 
 void
-cso_save_samplers(struct cso_context *ctx, unsigned shader_stage)
+cso_save_fragment_samplers(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   info->nr_samplers_saved = info->nr_samplers;
-   memcpy(info->samplers_saved, info->samplers, sizeof(info->samplers));
+   struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
+
+   ctx->nr_fragment_samplers_saved = info->nr_samplers;
+   memcpy(ctx->fragment_samplers_saved, info->samplers,
+          sizeof(info->samplers));
 }
 
 
 void
-cso_restore_samplers(struct cso_context *ctx, unsigned shader_stage)
+cso_restore_fragment_samplers(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   info->nr_samplers = info->nr_samplers_saved;
-   memcpy(info->samplers, info->samplers_saved, sizeof(info->samplers));
-   single_sampler_done(ctx, shader_stage);
+   struct sampler_info *info = &ctx->samplers[PIPE_SHADER_FRAGMENT];
+
+   info->nr_samplers = ctx->nr_fragment_samplers_saved;
+   memcpy(info->samplers, ctx->fragment_samplers_saved,
+          sizeof(info->samplers));
+   cso_single_sampler_done(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 
@@ -1281,71 +1194,74 @@ cso_set_sampler_views(struct cso_context *ctx,
                       unsigned count,
                       struct pipe_sampler_view **views)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   unsigned i;
-   boolean any_change = FALSE;
+   if (shader_stage == PIPE_SHADER_FRAGMENT) {
+      unsigned i;
+      boolean any_change = FALSE;
 
-   /* reference new views */
-   for (i = 0; i < count; i++) {
-      any_change |= info->views[i] != views[i];
-      pipe_sampler_view_reference(&info->views[i], views[i]);
-   }
-   /* unref extra old views, if any */
-   for (; i < info->nr_views; i++) {
-      any_change |= info->views[i] != NULL;
-      pipe_sampler_view_reference(&info->views[i], NULL);
-   }
+      /* reference new views */
+      for (i = 0; i < count; i++) {
+         any_change |= ctx->fragment_views[i] != views[i];
+         pipe_sampler_view_reference(&ctx->fragment_views[i], views[i]);
+      }
+      /* unref extra old views, if any */
+      for (; i < ctx->nr_fragment_views; i++) {
+         any_change |= ctx->fragment_views[i] != NULL;
+         pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
+      }
 
-   /* bind the new sampler views */
-   if (any_change) {
-      ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0,
-                                   MAX2(info->nr_views, count),
-                                   info->views);
-   }
+      /* bind the new sampler views */
+      if (any_change) {
+         ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0,
+                                      MAX2(ctx->nr_fragment_views, count),
+                                      ctx->fragment_views);
+      }
 
-   info->nr_views = count;
+      ctx->nr_fragment_views = count;
+   }
+   else
+      ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0, count, views);
 }
 
 
 void
-cso_save_sampler_views(struct cso_context *ctx, unsigned shader_stage)
+cso_save_fragment_sampler_views(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
    unsigned i;
 
-   info->nr_views_saved = info->nr_views;
+   ctx->nr_fragment_views_saved = ctx->nr_fragment_views;
 
-   for (i = 0; i < info->nr_views; i++) {
-      assert(!info->views_saved[i]);
-      pipe_sampler_view_reference(&info->views_saved[i], info->views[i]);
+   for (i = 0; i < ctx->nr_fragment_views; i++) {
+      assert(!ctx->fragment_views_saved[i]);
+      pipe_sampler_view_reference(&ctx->fragment_views_saved[i],
+                                  ctx->fragment_views[i]);
    }
 }
 
 
 void
-cso_restore_sampler_views(struct cso_context *ctx, unsigned shader_stage)
+cso_restore_fragment_sampler_views(struct cso_context *ctx)
 {
-   struct sampler_info *info = &ctx->samplers[shader_stage];
-   unsigned i, nr_saved = info->nr_views_saved;
+   unsigned i, nr_saved = ctx->nr_fragment_views_saved;
    unsigned num;
 
    for (i = 0; i < nr_saved; i++) {
-      pipe_sampler_view_reference(&info->views[i], NULL);
+      pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
       /* move the reference from one pointer to another */
-      info->views[i] = info->views_saved[i];
-      info->views_saved[i] = NULL;
+      ctx->fragment_views[i] = ctx->fragment_views_saved[i];
+      ctx->fragment_views_saved[i] = NULL;
    }
-   for (; i < info->nr_views; i++) {
-      pipe_sampler_view_reference(&info->views[i], NULL);
+   for (; i < ctx->nr_fragment_views; i++) {
+      pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
    }
 
-   num = MAX2(info->nr_views, nr_saved);
+   num = MAX2(ctx->nr_fragment_views, nr_saved);
 
    /* bind the old/saved sampler views */
-   ctx->pipe->set_sampler_views(ctx->pipe, shader_stage, 0, num, info->views);
+   ctx->pipe->set_sampler_views(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, num,
+                                ctx->fragment_views);
 
-   info->nr_views = nr_saved;
-   info->nr_views_saved = 0;
+   ctx->nr_fragment_views = nr_saved;
+   ctx->nr_fragment_views_saved = 0;
 }
 
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index cc50b60c6cd..f0a27390d17 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -72,19 +72,17 @@ cso_set_samplers(struct cso_context *cso,
                  const struct pipe_sampler_state **states);
 
 void
-cso_save_samplers(struct cso_context *cso, unsigned shader_stage);
+cso_save_fragment_samplers(struct cso_context *cso);
 
 void
-cso_restore_samplers(struct cso_context *cso, unsigned shader_stage);
+cso_restore_fragment_samplers(struct cso_context *cso);
 
 /* Alternate interface to support state trackers that like to modify
  * samplers one at a time:
  */
 enum pipe_error
-cso_single_sampler(struct cso_context *cso,
-                   unsigned shader_stage,
-                   unsigned count,
-                   const struct pipe_sampler_state *states);
+cso_single_sampler(struct cso_context *cso, unsigned shader_stage,
+                   unsigned idx, const struct pipe_sampler_state *states);
 
 void
 cso_single_sampler_done(struct cso_context *cso, unsigned shader_stage);
@@ -188,19 +186,6 @@ void cso_save_render_condition(struct cso_context *cso);
 void cso_restore_render_condition(struct cso_context *cso);
 
 
-/* clip state */
-
-void
-cso_set_clip(struct cso_context *cso,
-             const struct pipe_clip_state *clip);
-
-void
-cso_save_clip(struct cso_context *cso);
-
-void
-cso_restore_clip(struct cso_context *cso);
-
-
 /* sampler view state */
 
 void
@@ -210,10 +195,10 @@ cso_set_sampler_views(struct cso_context *cso,
                       struct pipe_sampler_view **views);
 
 void
-cso_save_sampler_views(struct cso_context *cso, unsigned shader_stage);
+cso_save_fragment_sampler_views(struct cso_context *ctx);
 
 void
-cso_restore_sampler_views(struct cso_context *cso, unsigned shader_stage);
+cso_restore_fragment_sampler_views(struct cso_context *ctx);
 
 
 /* constant buffers */
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index a1564f93292..c827a68ea0a 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -45,7 +45,7 @@
 /* fixme: move it from here */
 #define MAX_PRIMITIVES 64
 
-static INLINE int
+static inline int
 draw_gs_get_input_index(int semantic, int index,
                         const struct tgsi_shader_info *input_info)
 {
@@ -66,7 +66,7 @@ draw_gs_get_input_index(int semantic, int index,
  * the number of elements in the SOA vector. This ensures that the
  * throughput is optimized for the given vector instruction set.
  */
-static INLINE boolean
+static inline boolean
 draw_gs_should_flush(struct draw_geometry_shader *shader)
 {
    return (shader->fetched_prim_count == shader->vector_length);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 90a31bc6ac0..b1e1bcbee04 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -72,7 +72,7 @@ struct draw_gs_llvm_iface {
    LLVMValueRef input;
 };
 
-static INLINE const struct draw_gs_llvm_iface *
+static inline const struct draw_gs_llvm_iface *
 draw_gs_llvm_iface(const struct lp_build_tgsi_gs_iface *iface)
 {
    return (const struct draw_gs_llvm_iface *)iface;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index d48ed721593..d153c166ead 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -350,7 +350,7 @@ struct draw_gs_llvm_variant_key
     PIPE_MAX_SHADER_SAMPLER_VIEWS * sizeof(struct draw_sampler_static_state))
 
 
-static INLINE size_t
+static inline size_t
 draw_llvm_variant_key_size(unsigned nr_vertex_elements,
                            unsigned nr_samplers)
 {
@@ -360,7 +360,7 @@ draw_llvm_variant_key_size(unsigned nr_vertex_elements,
 }
 
 
-static INLINE size_t
+static inline size_t
 draw_gs_llvm_variant_key_size(unsigned nr_samplers)
 {
    return (sizeof(struct draw_gs_llvm_variant_key) +
@@ -368,7 +368,7 @@ draw_gs_llvm_variant_key_size(unsigned nr_samplers)
 }
 
 
-static INLINE struct draw_sampler_static_state *
+static inline struct draw_sampler_static_state *
 draw_llvm_variant_key_samplers(struct draw_llvm_variant_key *key)
 {
    return (struct draw_sampler_static_state *)
@@ -476,13 +476,13 @@ struct draw_llvm {
 };
 
 
-static INLINE struct llvm_vertex_shader *
+static inline struct llvm_vertex_shader *
 llvm_vertex_shader(struct draw_vertex_shader *vs)
 {
    return (struct llvm_vertex_shader *)vs;
 }
 
-static INLINE struct llvm_geometry_shader *
+static inline struct llvm_geometry_shader *
 llvm_geometry_shader(struct draw_geometry_shader *gs)
 {
    return (struct llvm_geometry_shader *)gs;
diff --git a/src/gallium/auxiliary/draw/draw_pipe.h b/src/gallium/auxiliary/draw/draw_pipe.h
index 35273330d13..e69dcbded0e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.h
+++ b/src/gallium/auxiliary/draw/draw_pipe.h
@@ -115,7 +115,7 @@ void draw_unfilled_prepare_outputs(struct draw_context *context,
  * \param idx  index into stage's tmp[] array to put the copy (dest)
  * \return  pointer to the copied vertex
  */
-static INLINE struct vertex_header *
+static inline struct vertex_header *
 dup_vert( struct draw_stage *stage,
 	  const struct vertex_header *vert,
 	  unsigned idx )
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 936046ea5f5..85d24b7a6a1 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -511,7 +511,7 @@ bind_aaline_fragment_shader(struct aaline_stage *aaline)
 
 
 
-static INLINE struct aaline_stage *
+static inline struct aaline_stage *
 aaline_stage( struct draw_stage *stage )
 {
    return (struct aaline_stage *) stage;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index 7feb49ae934..3918923296d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -427,7 +427,7 @@ bind_aapoint_fragment_shader(struct aapoint_stage *aapoint)
 
 
 
-static INLINE struct aapoint_stage *
+static inline struct aapoint_stage *
 aapoint_stage( struct draw_stage *stage )
 {
    return (struct aapoint_stage *) stage;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index e1e7dcc6f63..c22758bc702 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -70,12 +70,12 @@ struct clip_stage {
 
 
 /** Cast wrapper */
-static INLINE struct clip_stage *clip_stage( struct draw_stage *stage )
+static inline struct clip_stage *clip_stage( struct draw_stage *stage )
 {
    return (struct clip_stage *)stage;
 }
 
-static INLINE unsigned
+static inline unsigned
 draw_viewport_index(struct draw_context *draw,
                     const struct vertex_header *leading_vertex)
 {
@@ -210,7 +210,7 @@ static void interp( const struct clip_stage *clip,
  * true, otherwise returns false.
  * Triangle is considered null/empty if it's area is qual to zero.
  */
-static INLINE boolean
+static inline boolean
 is_tri_null(struct draw_context *draw, const struct prim_header *header)
 {
    const unsigned pos_attr = draw_current_shader_position_output(draw);
@@ -322,7 +322,7 @@ static void emit_poly( struct draw_stage *stage,
 }
 
 
-static INLINE float
+static inline float
 dot4(const float *a, const float *b)
 {
    return (a[0] * b[0] +
@@ -336,7 +336,7 @@ dot4(const float *a, const float *b)
  * it first checks if the shader provided a clip distance, otherwise
  * it works out the value using the clipvertex
  */
-static INLINE float getclipdist(const struct clip_stage *clipper,
+static inline float getclipdist(const struct clip_stage *clipper,
                                 struct vertex_header *vert,
                                 int plane_idx)
 {
diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index fa344089a8a..fc8293bd128 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -46,12 +46,12 @@ struct cull_stage {
 };
 
 
-static INLINE struct cull_stage *cull_stage( struct draw_stage *stage )
+static inline struct cull_stage *cull_stage( struct draw_stage *stage )
 {
    return (struct cull_stage *)stage;
 }
 
-static INLINE boolean
+static inline boolean
 cull_distance_is_out(float dist)
 {
    return (dist < 0.0f) || util_is_inf_or_nan(dist);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index 59e33b472f4..0ea740861d6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -47,7 +47,7 @@ struct flat_stage
 };
 
 
-static INLINE struct flat_stage *
+static inline struct flat_stage *
 flat_stage(struct draw_stage *stage)
 {
    return (struct flat_stage *) stage;
@@ -55,7 +55,7 @@ flat_stage(struct draw_stage *stage)
 
 
 /** Copy all the constant attributes from 'src' vertex to 'dst' vertex */
-static INLINE void copy_flats( struct draw_stage *stage,
+static inline void copy_flats( struct draw_stage *stage,
                                struct vertex_header *dst,
                                const struct vertex_header *src )
 {
@@ -70,7 +70,7 @@ static INLINE void copy_flats( struct draw_stage *stage,
 
 
 /** Copy all the color attributes from src vertex to dst0 & dst1 vertices */
-static INLINE void copy_flats2( struct draw_stage *stage,
+static inline void copy_flats2( struct draw_stage *stage,
                                 struct vertex_header *dst0,
                                 struct vertex_header *dst1,
                                 const struct vertex_header *src )
diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index b25dd21fd4d..5e0d8ce793d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -49,7 +49,7 @@ struct offset_stage {
 
 
 
-static INLINE struct offset_stage *offset_stage( struct draw_stage *stage )
+static inline struct offset_stage *offset_stage( struct draw_stage *stage )
 {
    return (struct offset_stage *) stage;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index 445f195e59c..186b4cb4935 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -462,7 +462,7 @@ bind_pstip_fragment_shader(struct pstip_stage *pstip)
 }
 
 
-static INLINE struct pstip_stage *
+static inline struct pstip_stage *
 pstip_stage( struct draw_stage *stage )
 {
    return (struct pstip_stage *) stage;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 476c011b9a0..381aa41530b 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -53,7 +53,7 @@ struct stipple_stage {
 };
 
 
-static INLINE struct stipple_stage *
+static inline struct stipple_stage *
 stipple_stage(struct draw_stage *stage)
 {
    return (struct stipple_stage *) stage;
@@ -108,7 +108,7 @@ emit_segment(struct draw_stage *stage, struct prim_header *header,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 stipple_test(int counter, ushort pattern, int factor)
 {
    int b = (counter / factor) & 0xf;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 8148f6b4569..7f958d9b985 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -43,7 +43,7 @@ struct twoside_stage {
 };
 
 
-static INLINE struct twoside_stage *twoside_stage( struct draw_stage *stage )
+static inline struct twoside_stage *twoside_stage( struct draw_stage *stage )
 {
    return (struct twoside_stage *)stage;
 }
@@ -51,7 +51,7 @@ static INLINE struct twoside_stage *twoside_stage( struct draw_stage *stage )
 /**
  * Copy back color(s) to front color(s).
  */
-static INLINE struct vertex_header *
+static inline struct vertex_header *
 copy_bfc( struct twoside_stage *twoside, 
           const struct vertex_header *v,
           unsigned idx )
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index 51fbdb97ae8..8e6435cdbb4 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -53,7 +53,7 @@ struct unfilled_stage {
 };
 
 
-static INLINE struct unfilled_stage *unfilled_stage( struct draw_stage *stage )
+static inline struct unfilled_stage *unfilled_stage( struct draw_stage *stage )
 {
    return (struct unfilled_stage *)stage;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index e0e32dd9bbe..5cc866d7eee 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -85,7 +85,7 @@ struct vbuf_stage {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct vbuf_stage *
+static inline struct vbuf_stage *
 vbuf_stage( struct draw_stage *stage )
 {
    assert(stage);
@@ -97,7 +97,7 @@ static void vbuf_flush_vertices( struct vbuf_stage *vbuf );
 static void vbuf_alloc_vertices( struct vbuf_stage *vbuf );
 
 
-static INLINE boolean 
+static inline boolean 
 overflow( void *map, void *ptr, unsigned bytes, unsigned bufsz )
 {
    unsigned long used = (unsigned long) ((char *)ptr - (char *)map);
@@ -105,7 +105,7 @@ overflow( void *map, void *ptr, unsigned bytes, unsigned bufsz )
 }
 
 
-static INLINE void 
+static inline void 
 check_space( struct vbuf_stage *vbuf, unsigned nr )
 {
    if (vbuf->nr_vertices + nr > vbuf->max_vertices ||
@@ -126,7 +126,7 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
  * have a couple of slots at the beginning (1-dword header, 4-dword
  * clip pos) that we ignore here.  We only use the vertex->data[] fields.
  */
-static INLINE ushort 
+static inline ushort 
 emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index 6c57d5c1e3e..38ac11a9adf 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -45,7 +45,7 @@ struct wideline_stage {
 
 
 
-static INLINE struct wideline_stage *wideline_stage( struct draw_stage *stage )
+static inline struct wideline_stage *wideline_stage( struct draw_stage *stage )
 {
    return (struct wideline_stage *)stage;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 05beba8cd97..348b0e93bbc 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -83,7 +83,7 @@ struct widepoint_stage {
 
 
 
-static INLINE struct widepoint_stage *
+static inline struct widepoint_stage *
 widepoint_stage( struct draw_stage *stage )
 {
    return (struct widepoint_stage *)stage;
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 7b893cb2692..0ad94bb031f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -494,7 +494,7 @@ void draw_update_viewport_flags(struct draw_context *draw);
  * Return index of the given viewport clamping it
  * to be between 0 <= and < PIPE_MAX_VIEWPORTS
  */
-static INLINE unsigned
+static inline unsigned
 draw_clamp_viewport_idx(int idx)
 {
    return ((PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0);
@@ -505,7 +505,7 @@ draw_clamp_viewport_idx(int idx)
  * overflows then it returns the value from
  * the overflow_value variable.
  */
-static INLINE unsigned
+static inline unsigned
 draw_overflow_uadd(unsigned a, unsigned b,
                    unsigned overflow_value)
 {
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 5af845ff938..ffec863ae6f 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -54,7 +54,7 @@ struct fetch_pipeline_middle_end {
 
 
 /** cast wrapper */
-static INLINE struct fetch_pipeline_middle_end *
+static inline struct fetch_pipeline_middle_end *
 fetch_pipeline_middle_end(struct draw_pt_middle_end *middle)
 {
    return (struct fetch_pipeline_middle_end *) middle;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index d17d6959b44..e42c4af0e70 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -60,7 +60,7 @@ struct llvm_middle_end {
 
 
 /** cast wrapper */
-static INLINE struct llvm_middle_end *
+static inline struct llvm_middle_end *
 llvm_middle_end(struct draw_pt_middle_end *middle)
 {
    return (struct llvm_middle_end *) middle;
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 71a7d3918e9..f0d5e0f5656 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -53,7 +53,7 @@ struct pt_post_vs {
                    const struct draw_prim_info *prim_info );
 };
 
-static INLINE void
+static inline void
 initialize_vertex_header(struct vertex_header *header)
 {
    header->clipmask = 0;
@@ -62,7 +62,7 @@ initialize_vertex_header(struct vertex_header *header)
    header->vertex_id = UNDEFINED_VERTEX_ID;
 }
 
-static INLINE float
+static inline float
 dot4(const float *a, const float *b)
 {
    return (a[0]*b[0] +
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index 91e67c0840d..20de26fd08a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -65,7 +65,7 @@ draw_so_info(const struct draw_context *draw)
    return state;
 }
 
-static INLINE boolean
+static inline boolean
 draw_has_so(const struct draw_context *draw)
 {
    const struct pipe_stream_output_info *state = draw_so_info(draw);
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
index 8098adea61f..8d448f92a26 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -84,7 +84,7 @@ vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags)
 /**
  * Add a fetch element and add it to the draw elements.
  */
-static INLINE void
+static inline void
 vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch, unsigned ofbias)
 {
    unsigned hash;
@@ -111,7 +111,7 @@ vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch, unsigned ofbias
  * The value is checked for overflows (both integer overflows
  * and the elements array overflow).
  */
-static INLINE unsigned
+static inline unsigned
 vsplit_get_base_idx(struct vsplit_frontend *vsplit,
                     unsigned start, unsigned fetch, unsigned *ofbit)
 {
@@ -137,7 +137,7 @@ vsplit_get_base_idx(struct vsplit_frontend *vsplit,
  * index, plus the element bias, clamped to maximum elememt
  * index if that addition overflows.
  */
-static INLINE unsigned
+static inline unsigned
 vsplit_get_bias_idx(struct vsplit_frontend *vsplit,
                     int idx, int bias, unsigned *ofbias)
 {
@@ -170,7 +170,7 @@ vsplit_get_bias_idx(struct vsplit_frontend *vsplit,
    elt_idx = vsplit_get_base_idx(vsplit, start, fetch, &ofbit);          \
    elt_idx = vsplit_get_bias_idx(vsplit, ofbit ? 0 : DRAW_GET_IDX(elts, elt_idx), elt_bias, &ofbias)
 
-static INLINE void
+static inline void
 vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts,
                        unsigned start, unsigned fetch, int elt_bias)
 {
@@ -179,7 +179,7 @@ vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts,
    vsplit_add_cache(vsplit, elt_idx, ofbias);
 }
 
-static INLINE void
+static inline void
 vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, const ushort *elts,
                        unsigned start, unsigned fetch, int elt_bias)
 {
@@ -193,7 +193,7 @@ vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, const ushort *elts,
  * Add a fetch element and add it to the draw elements.  The fetch element is
  * in full range (uint).
  */
-static INLINE void
+static inline void
 vsplit_add_cache_uint(struct vsplit_frontend *vsplit, const uint *elts,
                       unsigned start, unsigned fetch, int elt_bias)
 {
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
index 0f7a3cdc012..0afabb01398 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -129,7 +129,7 @@ CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit,
  * When spoken is TRUE, ispoken replaces istart;  When close is TRUE, iclose is
  * appended.
  */
-static INLINE void
+static inline void
 CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit,
                                         unsigned flags,
                                         unsigned istart, unsigned icount,
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index b4178d6a6c5..ee11d2f9276 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -91,13 +91,13 @@ struct vertex_info
    } attrib[PIPE_MAX_SHADER_OUTPUTS];
 };
 
-static INLINE size_t
+static inline size_t
 draw_vinfo_size( const struct vertex_info *a )
 {
    return offsetof(const struct vertex_info, attrib[a->num_attribs]);
 }
 
-static INLINE int
+static inline int
 draw_vinfo_compare( const struct vertex_info *a,
                     const struct vertex_info *b )
 {
@@ -105,7 +105,7 @@ draw_vinfo_compare( const struct vertex_info *a,
    return memcmp( a, b, sizea );
 }
 
-static INLINE void
+static inline void
 draw_vinfo_copy( struct vertex_info *dst,
                  const struct vertex_info *src )
 {
@@ -121,7 +121,7 @@ draw_vinfo_copy( struct vertex_info *dst,
  *                   corresponds to this attribute.
  * \return slot in which the attribute was added
  */
-static INLINE uint
+static inline uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
                       enum attrib_emit emit, 
                       enum interp_mode interp, /* only used by softpipe??? */
@@ -150,7 +150,7 @@ void draw_dump_emitted_vertex(const struct vertex_info *vinfo,
                               const uint8_t *data);
 
 
-static INLINE enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit)
+static inline enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit)
 {
    switch (emit) {
    case EMIT_OMIT:
@@ -174,7 +174,7 @@ static INLINE enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit
    }
 }
 
-static INLINE unsigned draw_translate_vinfo_size(enum attrib_emit emit)
+static inline unsigned draw_translate_vinfo_size(enum attrib_emit emit)
 {
    switch (emit) {
    case EMIT_OMIT:
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 1d54e7ef298..24b29e70dd9 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -191,12 +191,12 @@ draw_vs_create_variant_generic( struct draw_vertex_shader *vs,
 
 
 
-static INLINE int draw_vs_variant_keysize( const struct draw_vs_variant_key *key )
+static inline int draw_vs_variant_keysize( const struct draw_vs_variant_key *key )
 {
    return 2 * sizeof(int) + key->nr_elements * sizeof(struct draw_variant_element);
 }
 
-static INLINE int draw_vs_variant_key_compare( const struct draw_vs_variant_key *a,
+static inline int draw_vs_variant_key_compare( const struct draw_vs_variant_key *a,
                                          const struct draw_vs_variant_key *b )
 {
    int keysize = draw_vs_variant_keysize(a);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9daa93eec3e..50ae192325b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1135,7 +1135,7 @@ lp_build_div(struct lp_build_context *bld,
  *
  * @sa http://www.stereopsis.com/doubleblend.html
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_lerp_simple(struct lp_build_context *bld,
                      LLVMValueRef x,
                      LLVMValueRef v0,
@@ -1674,7 +1674,7 @@ enum lp_build_round_mode
  * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
  * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_round_sse41(struct lp_build_context *bld,
                      LLVMValueRef a,
                      enum lp_build_round_mode mode)
@@ -1717,7 +1717,7 @@ lp_build_round_sse41(struct lp_build_context *bld,
       args[2] = LLVMConstInt(i32t, mode, 0);
 
       res = lp_build_intrinsic(builder, intrinsic,
-                               vec_type, args, Elements(args));
+                               vec_type, args, Elements(args), 0);
 
       res = LLVMBuildExtractElement(builder, res, index0, "");
    }
@@ -1761,7 +1761,7 @@ lp_build_round_sse41(struct lp_build_context *bld,
 }
 
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                              LLVMValueRef a)
 {
@@ -1817,7 +1817,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
 
 /*
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_round_altivec(struct lp_build_context *bld,
                        LLVMValueRef a,
                        enum lp_build_round_mode mode)
@@ -1851,7 +1851,7 @@ lp_build_round_altivec(struct lp_build_context *bld,
    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
 }
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_round_arch(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_mode mode)
@@ -1997,6 +1997,12 @@ lp_build_floor(struct lp_build_context *bld,
       LLVMTypeRef int_vec_type = bld->int_vec_type;
       LLVMTypeRef vec_type = bld->vec_type;
 
+      if (type.width != 32) {
+         char intrinsic[32];
+         util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
+
       assert(type.width == 32); /* might want to handle doubles at some point */
 
       inttype = type;
@@ -2066,6 +2072,12 @@ lp_build_ceil(struct lp_build_context *bld,
       LLVMTypeRef int_vec_type = bld->int_vec_type;
       LLVMTypeRef vec_type = bld->vec_type;
 
+      if (type.width != 32) {
+         char intrinsic[32];
+         util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
+
       assert(type.width == 32); /* might want to handle doubles at some point */
 
       inttype = type;
@@ -2427,7 +2439,7 @@ lp_build_sqrt(struct lp_build_context *bld,
  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_rcp_refine(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef rcp_a)
@@ -2512,7 +2524,7 @@ lp_build_rcp(struct lp_build_context *bld,
  *
  * See also Intel 64 and IA-32 Architectures Optimization Manual.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_rsqrt_refine(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef rsqrt_a)
@@ -3535,7 +3547,7 @@ lp_build_fpstate_get(struct gallivm_state *gallivm)
       lp_build_intrinsic(builder,
                          "llvm.x86.sse.stmxcsr",
                          LLVMVoidTypeInContext(gallivm->context),
-                         &mxcsr_ptr8, 1);
+                         &mxcsr_ptr8, 1, 0);
       return mxcsr_ptr;
    }
    return 0;
@@ -3582,6 +3594,6 @@ lp_build_fpstate_set(struct gallivm_state *gallivm,
       lp_build_intrinsic(builder,
                          "llvm.x86.sse.ldmxcsr",
                          LLVMVoidTypeInContext(gallivm->context),
-                         &mxcsr_ptr, 1);
+                         &mxcsr_ptr, 1, 0);
    }
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index b17c41931f4..a4c3bf0977a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -120,14 +120,14 @@ lp_build_const_mask_aos_swizzled(struct gallivm_state *gallivm,
                                  const unsigned char *swizzle);
 
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_const_int32(struct gallivm_state *gallivm, int i)
 {
    return LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), i, 0);
 }
 
 
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_const_float(struct gallivm_state *gallivm, float x)
 {
    return LLVMConstReal(LLVMFloatTypeInContext(gallivm->context), x);
@@ -135,7 +135,7 @@ lp_build_const_float(struct gallivm_state *gallivm, float x)
 
 
 /** Return constant-valued pointer to int */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_const_int_pointer(struct gallivm_state *gallivm, const void *ptr)
 {
    LLVMTypeRef int_type;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index 405e6486f7a..7283e2f162f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -61,6 +61,7 @@ lp_check_alignment(const void *ptr, unsigned alignment)
    return ((uintptr_t)ptr & (alignment - 1)) == 0;
 }
 
+#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBEDDED)
 
 class raw_debug_ostream :
    public llvm::raw_ostream
@@ -91,6 +92,7 @@ raw_debug_ostream::write_impl(const char *Ptr, size_t Size)
    }
 }
 
+#endif
 
 extern "C" const char *
 lp_get_module_id(LLVMModuleRef module)
@@ -123,7 +125,7 @@ lp_debug_dump_value(LLVMValueRef value)
  * - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html
  */
 static size_t
-disassemble(const void* func, llvm::raw_ostream & Out)
+disassemble(const void* func)
 {
    const uint8_t *bytes = (const uint8_t *)func;
 
@@ -141,7 +143,8 @@ disassemble(const void* func, llvm::raw_ostream & Out)
    char outline[1024];
 
    if (!D) {
-      Out << "error: couldn't create disassembler for triple " << Triple << "\n";
+      _debug_printf("error: couldn't create disassembler for triple %s\n",
+                    Triple.c_str());
       return 0;
    }
 
@@ -155,13 +158,13 @@ disassemble(const void* func, llvm::raw_ostream & Out)
        * so that between runs.
        */
 
-      Out << llvm::format("%6lu:\t", (unsigned long)pc);
+      _debug_printf("%6lu:\t", (unsigned long)pc);
 
       Size = LLVMDisasmInstruction(D, (uint8_t *)bytes + pc, extent - pc, 0, outline,
                                    sizeof outline);
 
       if (!Size) {
-         Out << "invalid\n";
+         _debug_printf("invalid\n");
          pc += 1;
          break;
       }
@@ -173,10 +176,10 @@ disassemble(const void* func, llvm::raw_ostream & Out)
       if (0) {
          unsigned i;
          for (i = 0; i < Size; ++i) {
-            Out << llvm::format("%02x ", bytes[pc + i]);
+            _debug_printf("%02x ", bytes[pc + i]);
          }
          for (; i < 16; ++i) {
-            Out << "   ";
+            _debug_printf("   ");
          }
       }
 
@@ -184,9 +187,9 @@ disassemble(const void* func, llvm::raw_ostream & Out)
        * Print the instruction.
        */
 
-      Out << outline;
+      _debug_printf("%s", outline);
 
-      Out << "\n";
+      _debug_printf("\n");
 
       /*
        * Stop disassembling on return statements, if there is no record of a
@@ -206,13 +209,12 @@ disassemble(const void* func, llvm::raw_ostream & Out)
       pc += Size;
 
       if (pc >= extent) {
-         Out << "disassembly larger than " << extent << "bytes, aborting\n";
+         _debug_printf("disassembly larger than %llu bytes, aborting\n", (unsigned long long)extent);
          break;
       }
    }
 
-   Out << "\n";
-   Out.flush();
+   _debug_printf("\n");
 
    LLVMDisasmDispose(D);
 
@@ -229,9 +231,8 @@ disassemble(const void* func, llvm::raw_ostream & Out)
 
 extern "C" void
 lp_disassemble(LLVMValueRef func, const void *code) {
-   raw_debug_ostream Out;
-   Out << LLVMGetValueName(func) << ":\n";
-   disassemble(code, Out);
+   _debug_printf("%s:\n", LLVMGetValueName(func));
+   disassemble(code);
 }
 
 
@@ -273,7 +274,7 @@ lp_profile(LLVMValueRef func, const void *code)
       unsigned long addr = (uintptr_t)code;
       llvm::raw_fd_ostream Out(perf_asm_fd, false);
       Out << symbol << ":\n";
-      unsigned long size = disassemble(code, Out);
+      unsigned long size = disassemble(code);
       fprintf(perf_map_file, "%lx %lx %s\n", addr, size, symbol);
       fflush(perf_map_file);
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 321e09d56b9..375ba6cb5ff 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -59,7 +59,7 @@ extern unsigned gallivm_debug;
 #endif
 
 
-static INLINE void
+static inline void
 lp_build_name(LLVMValueRef val, const char *format, ...)
 {
 #ifdef DEBUG
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index efe71704c3a..ddf3ad1dfc6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -95,7 +95,7 @@ lp_build_format_swizzle_aos(const struct util_format_description *desc,
 /**
  * Whether the format matches the vector type, apart of swizzles.
  */
-static INLINE boolean
+static inline boolean
 format_matches_type(const struct util_format_description *desc,
                     struct lp_type type)
 {
@@ -146,7 +146,7 @@ format_matches_type(const struct util_format_description *desc,
  *
  * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
                                const struct util_format_description *desc,
                                LLVMValueRef packed)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index 4f5a45c6a3d..fa0e8b656bb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -212,7 +212,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
 }
 
 
-static INLINE void
+static inline void
 yuv_to_rgb_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 384ea864081..017d0752060 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -106,7 +106,6 @@ enum LLVM_CodeGenOpt_Level {
 static boolean
 create_pass_manager(struct gallivm_state *gallivm)
 {
-   char *td_str;
    assert(!gallivm->passmgr);
    assert(gallivm->target);
 
@@ -122,10 +121,29 @@ create_pass_manager(struct gallivm_state *gallivm)
    // Old versions of LLVM get the DataLayout from the pass manager.
    LLVMAddTargetData(gallivm->target, gallivm->passmgr);
 
-   // New ones from the Module.
-   td_str = LLVMCopyStringRepOfTargetData(gallivm->target);
-   LLVMSetDataLayout(gallivm->module, td_str);
-   free(td_str);
+   /* Setting the module's DataLayout to an empty string will cause the
+    * ExecutionEngine to copy the DataLayout string from its target
+    * machine to the module.  As of LLVM 3.8 the module and the execution
+    * engine are required to have the same DataLayout.
+    *
+    * TODO: This is just a temporary work-around.  The correct solution is
+    * for gallivm_init_state() to create a TargetMachine and pull the
+    * DataLayout from there.  Currently, the TargetMachine used by llvmpipe
+    * is being implicitly created by the EngineBuilder in
+    * lp_build_create_jit_compiler_for_module()
+    */
+
+#if HAVE_LLVM < 0x0308
+   {
+      char *td_str;
+      // New ones from the Module.
+      td_str = LLVMCopyStringRepOfTargetData(gallivm->target);
+      LLVMSetDataLayout(gallivm->module, td_str);
+      free(td_str);
+   }
+#else
+   LLVMSetDataLayout(gallivm->module, "");
+#endif
 
    if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
       /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 2bf1211bcd7..30f4863ec44 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -81,7 +81,8 @@ lp_build_intrinsic(LLVMBuilderRef builder,
                    const char *name,
                    LLVMTypeRef ret_type,
                    LLVMValueRef *args,
-                   unsigned num_args)
+                   unsigned num_args,
+                   LLVMAttribute attr)
 {
    LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
    LLVMValueRef function;
@@ -99,6 +100,9 @@ lp_build_intrinsic(LLVMBuilderRef builder,
       }
 
       function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args);
+
+      if (attr)
+          LLVMAddFunctionAttr(function, attr);
    }
 
    return LLVMBuildCall(builder, function, args, num_args, "");
@@ -111,7 +115,7 @@ lp_build_intrinsic_unary(LLVMBuilderRef builder,
                          LLVMTypeRef ret_type,
                          LLVMValueRef a)
 {
-   return lp_build_intrinsic(builder, name, ret_type, &a, 1);
+   return lp_build_intrinsic(builder, name, ret_type, &a, 1, 0);
 }
 
 
@@ -127,7 +131,7 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
    args[0] = a;
    args[1] = b;
 
-   return lp_build_intrinsic(builder, name, ret_type, args, 2);
+   return lp_build_intrinsic(builder, name, ret_type, args, 2, 0);
 }
 
 
@@ -242,7 +246,7 @@ lp_build_intrinsic_map(struct gallivm_state *gallivm,
       LLVMValueRef res_elem;
       for(j = 0; j < num_args; ++j)
          arg_elems[j] = LLVMBuildExtractElement(builder, args[j], index, "");
-      res_elem = lp_build_intrinsic(builder, name, ret_elem_type, arg_elems, num_args);
+      res_elem = lp_build_intrinsic(builder, name, ret_elem_type, arg_elems, num_args, 0);
       res = LLVMBuildInsertElement(builder, res, res_elem, index, "");
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index 38c5c29c980..a54b367961a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -59,7 +59,8 @@ lp_build_intrinsic(LLVMBuilderRef builder,
                    const char *name,
                    LLVMTypeRef ret_type,
                    LLVMValueRef *args,
-                   unsigned num_args);
+                   unsigned num_args,
+                   LLVMAttribute attr);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index db503514881..571c615f9f8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -88,7 +88,7 @@
  * actually try to allocate the maximum and run out of memory and crash.  So
  * stick with something reasonable here.
  */
-static INLINE int
+static inline int
 gallivm_get_shader_param(enum pipe_shader_cap param)
 {
    switch(param) {
@@ -100,7 +100,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
       return LP_MAX_TGSI_NESTING;
    case PIPE_SHADER_CAP_MAX_INPUTS:
-      return PIPE_MAX_SHADER_INPUTS;
+      return 32;
    case PIPE_SHADER_CAP_MAX_OUTPUTS:
       return 32;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
@@ -132,6 +132,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 1;
    case PIPE_SHADER_CAP_DOUBLES:
+      return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 80b53e5c3f8..19d30d0d63c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -395,7 +395,7 @@ lp_build_select(struct lp_build_context *bld,
       args[2] = mask;
 
       res = lp_build_intrinsic(builder, intrinsic,
-                               arg_type, args, Elements(args));
+                               arg_type, args, Elements(args), 0);
 
       if (arg_type != bld->vec_type) {
          res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 640b7e0d7e0..eba758da6ae 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -371,7 +371,7 @@ struct lp_build_sample_context
  * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at
  * this time.  Return whether the given mode is supported by that function.
  */
-static INLINE boolean
+static inline boolean
 lp_is_simple_wrap_mode(unsigned mode)
 {
    switch (mode) {
@@ -384,7 +384,7 @@ lp_is_simple_wrap_mode(unsigned mode)
 }
 
 
-static INLINE void
+static inline void
 apply_sampler_swizzle(struct lp_build_sample_context *bld,
                       LLVMValueRef *texel)
 {
@@ -402,7 +402,7 @@ apply_sampler_swizzle(struct lp_build_sample_context *bld,
  * not really dimension as such, this indicates the amount of
  * "normal" texture coords subject to minification, wrapping etc.
  */
-static INLINE unsigned
+static inline unsigned
 texture_dims(enum pipe_texture_target tex)
 {
    switch (tex) {
@@ -424,7 +424,7 @@ texture_dims(enum pipe_texture_target tex)
    }
 }
 
-static INLINE boolean
+static inline boolean
 has_layer_coord(enum pipe_texture_target tex)
 {
    switch (tex) {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index e391d8a4301..c4ae30461cb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -104,7 +104,7 @@ lp_build_tgsi_intrinsic(
    struct lp_build_context * base = &bld_base->base;
    emit_data->output[emit_data->chan] = lp_build_intrinsic(
                base->gallivm->builder, action->intr_name,
-               emit_data->dst_type, emit_data->args, emit_data->arg_count);
+               emit_data->dst_type, emit_data->args, emit_data->arg_count, 0);
 }
 
 LLVMValueRef
@@ -175,13 +175,52 @@ void lp_build_fetch_args(
    unsigned src;
    for (src = 0; src < emit_data->info->num_src; src++) {
       emit_data->args[src] = lp_build_emit_fetch(bld_base, emit_data->inst, src,
-                                               emit_data->chan);
+                                                 emit_data->src_chan);
    }
    emit_data->arg_count = emit_data->info->num_src;
    lp_build_action_set_dst_type(emit_data, bld_base,
 		emit_data->inst->Instruction.Opcode);
 }
 
+/**
+ * Map a destination channel to the source channel to fetch from; with
+ * doubles the src and dst channels aren't 1:1.  Returns -1 to skip a channel.
+ * 1. if neither type is double then src == dst;
+ * 2. if dst is double
+ *     - y and w are skipped (returns -1)
+ *     - if src is double then src == dst.
+ *     - else (e.g. f2d) dst x, i.e. d.xy, reads s.x
+ *     - else (e.g. f2d) dst z, i.e. d.zw, reads s.y
+ * 3. if dst is single, src is double
+ *    - dst x,z read src x (the xy pair);
+ *    - dst y,w read src z (the zw pair);
+ */
+static int get_src_chan_idx(unsigned opcode,
+                            int dst_chan_index)
+{
+   const int dst_is_dbl = tgsi_opcode_infer_dst_type(opcode) == TGSI_TYPE_DOUBLE;
+   const int src_is_dbl = tgsi_opcode_infer_src_type(opcode) == TGSI_TYPE_DOUBLE;
+   const int is_xz = dst_chan_index == 0 || dst_chan_index == 2;
+   const int is_yw = dst_chan_index == 1 || dst_chan_index == 3;
+
+   /* No doubles involved: channels line up one to one. */
+   if (!dst_is_dbl && !src_is_dbl)
+      return dst_chan_index;
+
+   if (!dst_is_dbl) {
+      /* double -> single: x,z come from the xy pair, y,w from the zw pair */
+      return is_xz ? 0 : (is_yw ? 2 : -1);
+   }
+
+   /* Double destination: y and w are skipped. */
+   if (is_yw)
+      return -1;
+   if (src_is_dbl)
+      return dst_chan_index;
+   /* single -> double: d.xy reads s.x, d.zw reads s.y */
+   return is_xz ? dst_chan_index / 2 : -1;
+}
+
 /* XXX: COMMENT
  * It should be assumed that this function ignores writemasks
  */
@@ -197,7 +236,6 @@ lp_build_tgsi_inst_llvm(
    struct lp_build_emit_data emit_data;
    unsigned chan_index;
    LLVMValueRef val;
-
    bld_base->pc++;
 
    if (bld_base->emit_debug) {
@@ -240,7 +278,12 @@ lp_build_tgsi_inst_llvm(
    /* Emit the instructions */
    if (info->output_mode == TGSI_OUTPUT_COMPONENTWISE && bld_base->soa) {
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
+         int src_index = get_src_chan_idx(inst->Instruction.Opcode, chan_index);
+         /* ignore channels 1/3 in double dst */
+         if (src_index == -1)
+            continue;
          emit_data.chan = chan_index;
+         emit_data.src_chan = src_index;
          if (!action->fetch_args) {
             lp_build_fetch_args(bld_base, &emit_data);
          } else {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 967373ccdae..2ca9c6194b3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -338,6 +338,7 @@ struct lp_build_tgsi_context
    struct lp_build_context uint_bld;
    struct lp_build_context int_bld;
 
+   struct lp_build_context dbl_bld;
    /** This array stores functions that are used to transform TGSI opcodes to
      * LLVM instructions.
      */
@@ -349,6 +350,9 @@ struct lp_build_tgsi_context
 
    struct lp_build_tgsi_action sqrt_action;
 
+   struct lp_build_tgsi_action drsq_action;
+
+   struct lp_build_tgsi_action dsqrt_action;
    const struct tgsi_shader_info *info;
 
    lp_build_emit_fetch_fn emit_fetch_funcs[TGSI_FILE_COUNT];
@@ -558,13 +562,13 @@ struct lp_build_tgsi_aos_context
 
 };
 
-static INLINE struct lp_build_tgsi_soa_context *
+static inline struct lp_build_tgsi_soa_context *
 lp_soa_context(struct lp_build_tgsi_context *bld_base)
 {
    return (struct lp_build_tgsi_soa_context *)bld_base;
 }
 
-static INLINE struct lp_build_tgsi_aos_context *
+static inline struct lp_build_tgsi_aos_context *
 lp_aos_context(struct lp_build_tgsi_context *bld_base)
 {
    return (struct lp_build_tgsi_aos_context *)bld_base;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 9cb42b237b7..0ad78b0ace2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -894,6 +894,125 @@ const struct lp_build_tgsi_action xpd_action = {
    xpd_emit	 /* emit */
 };
 
+/* TGSI_OPCODE_D2F: FPTrunc the double source to the base vector type */
+static void
+d2f_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPTrunc(builder, emit_data->args[0], bld_base->base.vec_type, "");
+}
+
+/* TGSI_OPCODE_D2I: FPToSI the double source to the base integer vector type */
+static void
+d2i_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPToSI(builder, emit_data->args[0], bld_base->base.int_vec_type, "");
+}
+
+/* TGSI_OPCODE_D2U: FPToUI the double source to the base integer vector type */
+static void
+d2u_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPToUI(builder, emit_data->args[0], bld_base->base.int_vec_type, "");
+}
+
+/* TGSI_OPCODE_F2D: FPExt the source to the double vector type */
+static void
+f2d_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFPExt(builder, emit_data->args[0], bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_U2D: UIToFP the source to the double vector type */
+static void
+u2d_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildUIToFP(builder, emit_data->args[0], bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_I2D: SIToFP the source to the double vector type */
+static void
+i2d_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+
+   emit_data->output[emit_data->chan] =
+      LLVMBuildSIToFP(builder, emit_data->args[0], bld_base->dbl_bld.vec_type, "");
+}
+
+/* TGSI_OPCODE_DMAD: dst = src0 * src1 + src2, built from DMUL then DADD */
+static void
+dmad_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef *src = emit_data->args;
+   LLVMValueRef product =
+      lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DMUL, src[0], src[1]);
+
+   emit_data->output[emit_data->chan] =
+      lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DADD, product, src[2]);
+}
+
+/* TGSI_OPCODE_DRCP: reciprocal, emitted as an FDiv of a 1.0 vector by src */
+static void drcp_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   struct lp_build_context *dbl = &bld_base->dbl_bld;
+   LLVMValueRef numerator = lp_build_const_vec(dbl->gallivm, dbl->type, 1.0f);
+   emit_data->output[emit_data->chan] =
+      LLVMBuildFDiv(builder, numerator, emit_data->args[0], "");
+}
+
+/* TGSI_OPCODE_DFRAC: fractional part, computed as src - floor(src) */
+static void dfrac_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMValueRef src = emit_data->args[0];
+   LLVMValueRef floor_val = lp_build_floor(&bld_base->dbl_bld, src);
+
+   emit_data->output[emit_data->chan] = LLVMBuildFSub(builder, src, floor_val, "");
+}
+
 void
 lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
 {
@@ -948,6 +1067,25 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
 
    bld_base->op_actions[TGSI_OPCODE_MAX].emit = fmax_emit;
    bld_base->op_actions[TGSI_OPCODE_MIN].emit = fmin_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DADD].emit = add_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit;
+   bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit;
+   bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit;
+   bld_base->op_actions[TGSI_OPCODE_D2U].emit = d2u_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_F2D].emit = f2d_emit;
+   bld_base->op_actions[TGSI_OPCODE_I2D].emit = i2d_emit;
+   bld_base->op_actions[TGSI_OPCODE_U2D].emit = u2d_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DMAD].emit = dmad_emit;
+
+   bld_base->op_actions[TGSI_OPCODE_DRCP].emit = drcp_emit;
+   bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = dfrac_emit;
+
 }
 
 /* CPU Only default actions */
@@ -1792,6 +1930,107 @@ xor_emit_cpu(
                                                      emit_data->args[1]);
 }
 
+/* TGSI_OPCODE_DABS (CPU Only): lp_build_abs in the double build context */
+static void
+dabs_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef src = emit_data->args[0];
+   emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->dbl_bld, src);
+}
+
+/* TGSI_OPCODE_DNEG (CPU Only): negate by subtracting src from dbl_bld.zero */
+static void
+dneg_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_context *dbl = &bld_base->dbl_bld;
+   emit_data->output[emit_data->chan] =
+      lp_build_sub(dbl, dbl->zero, emit_data->args[0]);
+}
+
+/* Shared body of the DSEQ/DSGE/DSLT/DSNE emitters (CPU Only) */
+static void
+dset_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data,
+   unsigned pipe_func)
+{
+   struct lp_build_context *dbl = &bld_base->dbl_bld;
+   LLVMValueRef *src = emit_data->args;
+   LLVMValueRef wide_mask = lp_build_cmp(dbl, pipe_func, src[0], src[1]);
+   /* the compare ran on 64-bit operands; the result is stored as 32 bit */
+   emit_data->output[emit_data->chan] = LLVMBuildTrunc(bld_base->base.gallivm->builder,
+      wide_mask, bld_base->int_bld.int_vec_type, "");
+}
+
+/* TGSI_OPCODE_DSEQ (CPU Only): dset_emit_cpu with PIPE_FUNC_EQUAL */
+static void
+dseq_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_EQUAL);
+}
+
+/* TGSI_OPCODE_DSGE (CPU Only): dset_emit_cpu with PIPE_FUNC_GEQUAL */
+static void
+dsge_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_GEQUAL);
+}
+
+/* TGSI_OPCODE_DSLT (CPU Only): dset_emit_cpu with PIPE_FUNC_LESS */
+static void
+dslt_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_LESS);
+}
+
+/* TGSI_OPCODE_DSNE (CPU Only): dset_emit_cpu with PIPE_FUNC_NOTEQUAL */
+static void
+dsne_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   dset_emit_cpu(action, bld_base, emit_data, PIPE_FUNC_NOTEQUAL);
+}
+
+/* Double reciprocal square root (CPU Only): lp_build_rsqrt on dbl_bld */
+static void
+drecip_sqrt_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef src = emit_data->args[0];
+   emit_data->output[emit_data->chan] = lp_build_rsqrt(&bld_base->dbl_bld, src);
+}
+
+/* Double square root (CPU Only): lp_build_sqrt on dbl_bld */
+static void
+dsqrt_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   LLVMValueRef src = emit_data->args[0];
+   emit_data->output[emit_data->chan] = lp_build_sqrt(&bld_base->dbl_bld, src);
+}
+
 void
 lp_set_default_actions_cpu(
    struct lp_build_tgsi_context * bld_base)
@@ -1864,4 +2103,14 @@ lp_set_default_actions_cpu(
 
    bld_base->op_actions[TGSI_OPCODE_XOR].emit = xor_emit_cpu;
 
+   bld_base->op_actions[TGSI_OPCODE_DABS].emit = dabs_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DNEG].emit = dneg_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = dseq_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSGE].emit = dsge_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSLT].emit = dslt_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSNE].emit = dsne_emit_cpu;
+
+   bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = drecip_sqrt_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = dsqrt_emit_cpu;
+
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
index fc7fdbdd231..463d44eb450 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.h
@@ -71,6 +71,11 @@ struct lp_build_emit_data {
     */
    unsigned chan;
 
+   /**
+    * This is used to specify the src channel to read from for doubles.
+    */
+   unsigned src_chan;
+
    /** The lp_build_tgsi_action::emit 'executes' the opcode and writes the
     * results to this array.
     */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index 55acea83799..906a1745551 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -462,7 +462,7 @@ analyse_instruction(struct analysis_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 dump_info(const struct tgsi_token *tokens,
           struct lp_tgsi_info *info)
 {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 268379e7d13..fae604e2f9c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -106,7 +106,7 @@ emit_dump_reg(struct gallivm_state *gallivm,
  * Return the context for the current function.
  * (always 'main', if shader doesn't do any function calls)
  */
-static INLINE struct function_ctx *
+static inline struct function_ctx *
 func_ctx(struct lp_exec_mask *mask)
 {
    assert(mask->function_stack_size > 0);
@@ -120,7 +120,7 @@ func_ctx(struct lp_exec_mask *mask)
  * no loop inside the current function, but we were inside
  * a loop inside another function, from which this one was called.
  */
-static INLINE boolean
+static inline boolean
 mask_has_loop(struct lp_exec_mask *mask)
 {
    int i;
@@ -138,7 +138,7 @@ mask_has_loop(struct lp_exec_mask *mask)
  * no switch in the current function, but we were inside
  * a switch inside another function, from which this one was called.
  */
-static INLINE boolean
+static inline boolean
 mask_has_switch(struct lp_exec_mask *mask)
 {
    int i;
@@ -156,7 +156,7 @@ mask_has_switch(struct lp_exec_mask *mask)
  * no conditional in the current function, but we were inside
  * a conditional inside another function, from which this one was called.
  */
-static INLINE boolean
+static inline boolean
 mask_has_cond(struct lp_exec_mask *mask)
 {
    int i;
@@ -947,15 +947,20 @@ static LLVMValueRef
 build_gather(struct lp_build_tgsi_context *bld_base,
              LLVMValueRef base_ptr,
              LLVMValueRef indexes,
-             LLVMValueRef overflow_mask)
+             LLVMValueRef overflow_mask,
+             LLVMValueRef indexes2)
 {
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    struct lp_build_context *bld = &bld_base->base;
-   LLVMValueRef res = bld->undef;
+   LLVMValueRef res;
    unsigned i;
 
+   if (indexes2)
+      res = LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2));
+   else
+      res = bld->undef;
    /*
     * overflow_mask is a vector telling us which channels
     * in the vector overflowed. We use the overflow behavior for
@@ -976,26 +981,47 @@ build_gather(struct lp_build_tgsi_context *bld_base,
        * control flow.
        */
       indexes = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes);
+      if (indexes2)
+         indexes2 = lp_build_select(uint_bld, overflow_mask, uint_bld->zero, indexes2);
    }
 
    /*
     * Loop over elements of index_vec, load scalar value, insert it into 'res'.
     */
-   for (i = 0; i < bld->type.length; i++) {
-      LLVMValueRef ii = lp_build_const_int32(bld->gallivm, i);
-      LLVMValueRef index = LLVMBuildExtractElement(builder,
-                                                   indexes, ii, "");
+   for (i = 0; i < bld->type.length * (indexes2 ? 2 : 1); i++) {
+      LLVMValueRef si, di;
+      LLVMValueRef index;
       LLVMValueRef scalar_ptr, scalar;
 
+      di = lp_build_const_int32(bld->gallivm, i);
+      if (indexes2)
+         si = lp_build_const_int32(bld->gallivm, i >> 1);
+      else
+         si = di;
+
+      if (indexes2 && (i & 1)) {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes2, si, "");
+      } else {
+         index = LLVMBuildExtractElement(builder,
+                                         indexes, si, "");
+      }
       scalar_ptr = LLVMBuildGEP(builder, base_ptr,
                                 &index, 1, "gather_ptr");
       scalar = LLVMBuildLoad(builder, scalar_ptr, "");
 
-      res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
+      res = LLVMBuildInsertElement(builder, res, scalar, di, "");
    }
 
    if (overflow_mask) {
-      res = lp_build_select(bld, overflow_mask, bld->zero, res);
+      if (indexes2) {
+         res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
+         overflow_mask = LLVMBuildSExt(builder, overflow_mask,
+                                       bld_base->dbl_bld.int_vec_type, "");
+         res = lp_build_select(&bld_base->dbl_bld, overflow_mask,
+                               bld_base->dbl_bld.zero, res);
+      } else
+         res = lp_build_select(bld, overflow_mask, bld->zero, res);
    }
 
    return res;
@@ -1139,8 +1165,10 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
    case TGSI_TYPE_SIGNED:
       bld_fetch = &bld_base->int_bld;
       break;
-   case TGSI_TYPE_VOID:
    case TGSI_TYPE_DOUBLE:
+      bld_fetch = &bld_base->dbl_bld;
+      break;
+   case TGSI_TYPE_VOID:
    default:
       assert(0);
       bld_fetch = NULL;
@@ -1216,6 +1244,7 @@ emit_fetch_constant(
          lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
       LLVMValueRef index_vec;  /* index into the const buffer */
       LLVMValueRef overflow_mask;
+      LLVMValueRef index_vec2 = NULL;
 
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
@@ -1235,22 +1264,33 @@ emit_fetch_constant(
       index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
       index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
 
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMValueRef swizzle_vec2;
+         swizzle_vec2 = lp_build_const_int_vec(gallivm, uint_bld->type, swizzle + 1);
+         index_vec2 = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec2 = lp_build_add(uint_bld, index_vec2, swizzle_vec2);
+      }
       /* Gather values from the constant buffer */
-      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask);
+      res = build_gather(bld_base, consts_ptr, index_vec, overflow_mask, index_vec2);
    }
    else {
       LLVMValueRef index;  /* index into the const buffer */
       LLVMValueRef scalar, scalar_ptr;
-
+      struct lp_build_context *bld_broad = &bld_base->base;
       index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
 
       scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
                                 &index, 1, "");
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMTypeRef dptr_type = LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0);
+         scalar_ptr = LLVMBuildBitCast(builder, scalar_ptr, dptr_type, "");
+         bld_broad = &bld_base->dbl_bld;
+      }
       scalar = LLVMBuildLoad(builder, scalar_ptr, "");
-      res = lp_build_broadcast_scalar(&bld_base->base, scalar);
+      res = lp_build_broadcast_scalar(bld_broad, scalar);
    }
 
-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
       struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
       res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
@@ -1258,6 +1298,39 @@ emit_fetch_constant(
    return res;
 }
 
+/**
+ * Build a vector of doubles from the two float channels holding its halves.
+ * Doubles live split across two consecutive channels (xy or zw): element i
+ * of 'input' and of 'input2' are interleaved as (input[i], input2[i]) into a
+ * float vector of twice the base length, which is then bitcast to the fetch
+ * vector type for 'stype' (e.g. 2 x 8 floats -> 16 floats -> 8 doubles).
+ */
+static LLVMValueRef
+emit_fetch_double(
+   struct lp_build_tgsi_context * bld_base,
+   enum tgsi_opcode_type stype,
+   LLVMValueRef input,
+   LLVMValueRef input2)
+{
+   struct gallivm_state *gallivm = lp_soa_context(bld_base)->bld_base.base.gallivm;
+   struct lp_build_context *fetch_bld = stype_to_fetch(bld_base, stype);
+   const unsigned length = bld_base->base.type.length;
+   const int num_elems = length * 2;
+   LLVMValueRef interleave[16];
+   LLVMValueRef mask, merged;
+   unsigned lane;
+
+   assert(num_elems <= 16);
+   /* Mask indices < length select from 'input', the rest from 'input2'. */
+   for (lane = 0; lane < length; lane++) {
+      interleave[2 * lane] = lp_build_const_int32(gallivm, lane);
+      interleave[2 * lane + 1] = lp_build_const_int32(gallivm, lane + length);
+   }
+   mask = LLVMConstVector(interleave, num_elems);
+   merged = LLVMBuildShuffleVector(gallivm->builder, input, input2, mask, "");
+   return LLVMBuildBitCast(gallivm->builder, merged, fetch_bld->vec_type, "");
+}
+
 static LLVMValueRef
 emit_fetch_immediate(
    struct lp_build_tgsi_context * bld_base,
@@ -1281,7 +1354,7 @@ emit_fetch_immediate(
       if (reg->Register.Indirect) {
          LLVMValueRef indirect_index;
          LLVMValueRef index_vec;  /* index into the immediate register array */
-
+         LLVMValueRef index_vec2 = NULL;
          indirect_index = get_indirect_index(bld,
                                              reg->Register.File,
                                              reg->Register.Index,
@@ -1296,25 +1369,46 @@ emit_fetch_immediate(
                                            indirect_index,
                                            swizzle,
                                            FALSE);
-
+         if (stype == TGSI_TYPE_DOUBLE)
+            index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                              indirect_index,
+                                              swizzle + 1,
+                                              FALSE);
          /* Gather values from the immediate register array */
-         res = build_gather(bld_base, imms_array, index_vec, NULL);
+         res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
       } else {
          LLVMValueRef lindex = lp_build_const_int32(gallivm,
                                         reg->Register.Index * 4 + swizzle);
          LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
                                                 bld->imms_array, &lindex, 1, "");
          res = LLVMBuildLoad(builder, imms_ptr, "");
+
+         if (stype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef lindex1;
+            LLVMValueRef imms_ptr2;
+            LLVMValueRef res2;
+
+            lindex1 = lp_build_const_int32(gallivm,
+                                           reg->Register.Index * 4 + swizzle + 1);
+            imms_ptr2 = LLVMBuildGEP(builder,
+                                      bld->imms_array, &lindex1, 1, "");
+            res2 = LLVMBuildLoad(builder, imms_ptr2, "");
+            res = emit_fetch_double(bld_base, stype, res, res2);
+         }
       }
    }
    else {
       res = bld->immediates[reg->Register.Index][swizzle];
+      if (stype == TGSI_TYPE_DOUBLE)
+         res = emit_fetch_double(bld_base, stype, res, bld->immediates[reg->Register.Index][swizzle + 1]);
    }
 
    if (stype == TGSI_TYPE_UNSIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
    } else if (stype == TGSI_TYPE_SIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
    }
    return res;
 }
@@ -1334,6 +1428,7 @@ emit_fetch_input(
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
       LLVMValueRef index_vec;  /* index into the input reg array */
+      LLVMValueRef index_vec2 = NULL;
       LLVMValueRef inputs_array;
       LLVMTypeRef fptr_type;
 
@@ -1346,23 +1441,43 @@ emit_fetch_input(
                                         indirect_index,
                                         swizzle,
                                         TRUE);
-
+      if (stype == TGSI_TYPE_DOUBLE) {
+         index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                           indirect_index,
+                                           swizzle + 1,
+                                           TRUE);
+      }
       /* cast inputs_array pointer to float* */
       fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
       inputs_array = LLVMBuildBitCast(builder, bld->inputs_array, fptr_type, "");
 
       /* Gather values from the input register array */
-      res = build_gather(bld_base, inputs_array, index_vec, NULL);
+      res = build_gather(bld_base, inputs_array, index_vec, NULL, index_vec2);
    } else {
       if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
          LLVMValueRef lindex = lp_build_const_int32(gallivm,
                                         reg->Register.Index * 4 + swizzle);
-         LLVMValueRef input_ptr =  LLVMBuildGEP(builder,
-                                                bld->inputs_array, &lindex, 1, "");
+         LLVMValueRef input_ptr = LLVMBuildGEP(builder,
+                                               bld->inputs_array, &lindex, 1, "");
+
          res = LLVMBuildLoad(builder, input_ptr, "");
+         if (stype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef lindex1;
+            LLVMValueRef input_ptr2;
+            LLVMValueRef res2;
+
+            lindex1 = lp_build_const_int32(gallivm,
+                                           reg->Register.Index * 4 + swizzle + 1);
+            input_ptr2 = LLVMBuildGEP(builder,
+                                      bld->inputs_array, &lindex1, 1, "");
+            res2 = LLVMBuildLoad(builder, input_ptr2, "");
+            res = emit_fetch_double(bld_base, stype, res, res2);
+         }
       }
       else {
          res = bld->inputs[reg->Register.Index][swizzle];
+         if (stype == TGSI_TYPE_DOUBLE)
+            res = emit_fetch_double(bld_base, stype, res, bld->inputs[reg->Register.Index][swizzle + 1]);
       }
    }
 
@@ -1372,6 +1487,8 @@ emit_fetch_input(
       res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
    } else if (stype == TGSI_TYPE_SIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
    }
 
    return res;
@@ -1413,7 +1530,7 @@ emit_fetch_gs_input(
    } else {
       attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
    }
-   
+
    if (reg->Dimension.Indirect) {
       vertex_index = get_indirect_index(bld,
                                         reg->Register.File,
@@ -1436,6 +1553,8 @@ emit_fetch_gs_input(
       res = LLVMBuildBitCast(builder, res, bld_base->uint_bld.vec_type, "");
    } else if (stype == TGSI_TYPE_SIGNED) {
       res = LLVMBuildBitCast(builder, res, bld_base->int_bld.vec_type, "");
+   } else if (stype == TGSI_TYPE_DOUBLE) {
+      res = LLVMBuildBitCast(builder, res, bld_base->dbl_bld.vec_type, "");
    }
 
    return res;
@@ -1455,7 +1574,7 @@ emit_fetch_temporary(
 
    if (reg->Register.Indirect) {
       LLVMValueRef indirect_index;
-      LLVMValueRef index_vec;  /* index into the temp reg array */
+      LLVMValueRef index_vec, index_vec2 = NULL;  /* index into the temp reg array */
       LLVMValueRef temps_array;
       LLVMTypeRef fptr_type;
 
@@ -1468,21 +1587,35 @@ emit_fetch_temporary(
                                         indirect_index,
                                         swizzle,
                                         TRUE);
+      if (stype == TGSI_TYPE_DOUBLE) {
+               index_vec2 = get_soa_array_offsets(&bld_base->uint_bld,
+                                                  indirect_index,
+                                                  swizzle + 1,
+                                                  TRUE);
+      }
 
       /* cast temps_array pointer to float* */
       fptr_type = LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
       temps_array = LLVMBuildBitCast(builder, bld->temps_array, fptr_type, "");
 
       /* Gather values from the temporary register array */
-      res = build_gather(bld_base, temps_array, index_vec, NULL);
+      res = build_gather(bld_base, temps_array, index_vec, NULL, index_vec2);
    }
    else {
       LLVMValueRef temp_ptr;
       temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
       res = LLVMBuildLoad(builder, temp_ptr, "");
+
+      if (stype == TGSI_TYPE_DOUBLE) {
+         LLVMValueRef temp_ptr2, res2;
+
+         temp_ptr2 = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle + 1);
+         res2 = LLVMBuildLoad(builder, temp_ptr2, "");
+         res = emit_fetch_double(bld_base, stype, res, res2);
+      }
    }
 
-   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED || stype == TGSI_TYPE_DOUBLE) {
       struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
       res = LLVMBuildBitCast(builder, res, bld_fetch->vec_type, "");
    }
@@ -1648,6 +1781,50 @@ emit_fetch_predicate(
    }
 }
 
+/**
+ * Store a vector of doubles as two float channels, via lp_exec_mask_store
+ * with the exec mask and 'pred'.  'value' is expected to be a float vector
+ * of twice the base length holding each double as a pair of 32-bit pieces:
+ *    d0.x, d0.y, d1.x, d1.y, ...
+ * The even elements are stored through chan_ptr  (d0.x, d1.x, d2.x, ...)
+ * and the odd elements through chan_ptr2 (d0.y, d1.y, d2.y, ...).
+ * 'dtype' is not used here.
+ */
+static void
+emit_store_double_chan(struct lp_build_tgsi_context *bld_base,
+                       int dtype,
+                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
+                       LLVMValueRef pred,
+                       LLVMValueRef value)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *float_bld = &bld_base->base;
+   const unsigned length = bld_base->base.type.length;
+   LLVMValueRef undef = LLVMGetUndef(LLVMTypeOf(value));
+   LLVMValueRef temp, temp2;
+   LLVMValueRef shuffles[8];
+   LLVMValueRef shuffles2[8];
+   unsigned i;
+
+   /* shuffles[] and shuffles2[] only have room for 8 lanes. */
+   assert(length <= 8);
+
+   for (i = 0; i < length; i++) {
+      shuffles[i] = lp_build_const_int32(gallivm, i * 2);
+      shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
+   }
+
+   /* Only 'value' is indexed by the masks; the second operand is unused. */
+   temp = LLVMBuildShuffleVector(builder, value, undef,
+                                 LLVMConstVector(shuffles, length), "");
+   temp2 = LLVMBuildShuffleVector(builder, value, undef,
+                                  LLVMConstVector(shuffles2, length), "");
+
+   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
+}
 
 /**
  * Register store.
@@ -1683,6 +1860,11 @@ emit_store_chan(
    }
 
    if (reg->Register.Indirect) {
+      /*
+       * Currently the mesa/st doesn't generate indirect stores
+       * to doubles, it normally uses MOV to do indirect stores.
+       */
+      assert(dtype != TGSI_TYPE_DOUBLE);
       indirect_index = get_indirect_index(bld,
                                           reg->Register.File,
                                           reg->Register.Index,
@@ -1721,13 +1903,23 @@ emit_store_chan(
       else {
          LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
                                                   chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
+
+         if (dtype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
+                                                      chan_index + 1);
+            emit_store_double_chan(bld_base, dtype, out_ptr, out_ptr2,
+                                   pred, value);
+         } else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
       }
       break;
 
    case TGSI_FILE_TEMPORARY:
       /* Temporaries are always stored as floats */
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      if (dtype != TGSI_TYPE_DOUBLE)
+         value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
+      else
+         value = LLVMBuildBitCast(builder, value,  LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), bld_base->base.type.length * 2), "");
 
       if (reg->Register.Indirect) {
          LLVMValueRef index_vec;  /* indexes into the temp registers */
@@ -1749,7 +1941,16 @@ emit_store_chan(
       else {
          LLVMValueRef temp_ptr;
          temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
+
+         if (dtype == TGSI_TYPE_DOUBLE) {
+            LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld,
+                                                         reg->Register.Index,
+                                                         chan_index + 1);
+            emit_store_double_chan(bld_base, dtype, temp_ptr, temp_ptr2,
+                                   pred, value);
+         }
+         else
+            lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
       }
       break;
 
@@ -1818,13 +2019,16 @@ emit_store(
 {
    unsigned chan_index;
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
-
+   enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
    if(info->num_dst) {
       LLVMValueRef pred[TGSI_NUM_CHANNELS];
 
       emit_fetch_predicate( bld, inst, pred );
 
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+
+         if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
+             continue;
          emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
       }
    }
@@ -2823,6 +3027,7 @@ void lp_emit_immediate_soa(
                lp_build_const_vec(gallivm, bld_base->base.type, imm->u[i].Float);
 
       break;
+   case TGSI_IMM_FLOAT64:
    case TGSI_IMM_UINT32:
       for( i = 0; i < size; ++i ) {
          LLVMValueRef tmp = lp_build_const_vec(gallivm, bld_base->uint_bld.type, imm->u[i].Uint);
@@ -2857,8 +3062,7 @@ void lp_emit_immediate_soa(
    } else {
       /* simply copy the immediate values into the next immediates[] slot */
       unsigned i;
-      const uint size = imm->Immediate.NrTokens - 1;
-      assert(size <= 4);
+      assert(imm->Immediate.NrTokens - 1 <= 4);
       assert(bld->num_immediates < LP_MAX_INLINED_IMMEDIATES);
 
       for(i = 0; i < 4; ++i )
@@ -3674,6 +3878,12 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
    lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
    lp_build_context_init(&bld.elem_bld, gallivm, lp_elem_type(type));
+   {
+      struct lp_type dbl_type;
+      dbl_type = type;
+      dbl_type.width *= 2;
+      lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type);
+   }
    bld.mask = mask;
    bld.inputs = inputs;
    bld.outputs = outputs;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 191cf92d2d1..7fb449fd03f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -173,7 +173,7 @@ struct lp_build_context
  *
  * e.g. With PIPE_FORMAT_R32G32B32A32_FLOAT returns an lp_type with float[4]
  */
-static INLINE void
+static inline void
 lp_type_from_format_desc(struct lp_type* type, const struct util_format_description *format_desc)
 {
    assert(format_desc->is_array);
@@ -189,14 +189,14 @@ lp_type_from_format_desc(struct lp_type* type, const struct util_format_descript
 }
 
 
-static INLINE void
+static inline void
 lp_type_from_format(struct lp_type* type, enum pipe_format format)
 {
    lp_type_from_format_desc(type, util_format_description(format));
 }
 
 
-static INLINE unsigned
+static inline unsigned
 lp_type_width(struct lp_type type)
 {
    return type.width * type.length;
@@ -204,7 +204,7 @@ lp_type_width(struct lp_type type)
 
 
 /** Create scalar float type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_float(unsigned width)
 {
    struct lp_type res_type;
@@ -220,7 +220,7 @@ lp_type_float(unsigned width)
 
 
 /** Create vector of float type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_float_vec(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -236,7 +236,7 @@ lp_type_float_vec(unsigned width, unsigned total_width)
 
 
 /** Create scalar int type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_int(unsigned width)
 {
    struct lp_type res_type;
@@ -251,7 +251,7 @@ lp_type_int(unsigned width)
 
 
 /** Create vector int type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_int_vec(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -266,7 +266,7 @@ lp_type_int_vec(unsigned width, unsigned total_width)
 
 
 /** Create scalar uint type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_uint(unsigned width)
 {
    struct lp_type res_type;
@@ -280,7 +280,7 @@ lp_type_uint(unsigned width)
 
 
 /** Create vector uint type */
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_uint_vec(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -293,7 +293,7 @@ lp_type_uint_vec(unsigned width, unsigned total_width)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_unorm(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -307,7 +307,7 @@ lp_type_unorm(unsigned width, unsigned total_width)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_fixed(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -322,7 +322,7 @@ lp_type_fixed(unsigned width, unsigned total_width)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_type_ufixed(unsigned width, unsigned total_width)
 {
    struct lp_type res_type;
@@ -364,7 +364,7 @@ LLVMTypeRef
 lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type);
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_float32_vec4_type(void)
 {
    struct lp_type type;
@@ -380,7 +380,7 @@ lp_float32_vec4_type(void)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_int32_vec4_type(void)
 {
    struct lp_type type;
@@ -396,7 +396,7 @@ lp_int32_vec4_type(void)
 }
 
 
-static INLINE struct lp_type
+static inline struct lp_type
 lp_unorm8_vec4_type(void)
 {
    struct lp_type type;
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 6a124f7d716..95eed2698bc 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -231,18 +231,53 @@ hud_draw_string(struct hud_context *hud, unsigned x, unsigned y,
 }
 
 static void
-number_to_human_readable(uint64_t num, boolean is_in_bytes, char *out)
+number_to_human_readable(uint64_t num, uint64_t max_value,
+                         enum pipe_driver_query_type type, char *out)
 {
    static const char *byte_units[] =
-      {"", " KB", " MB", " GB", " TB", " PB", " EB"};
+      {" B", " KB", " MB", " GB", " TB", " PB", " EB"};
    static const char *metric_units[] =
       {"", " k", " M", " G", " T", " P", " E"};
-   const char **units = is_in_bytes ? byte_units : metric_units;
-   double divisor = is_in_bytes ? 1024 : 1000;
-   int unit = 0;
+   static const char *time_units[] =
+      {" us", " ms", " s"};  /* based on microseconds */
+   static const char *hz_units[] =
+      {" Hz", " KHz", " MHz", " GHz"};
+   static const char *percent_units[] = {"%"};
+
+   const char **units;
+   unsigned max_unit;
+   double divisor = (type == PIPE_DRIVER_QUERY_TYPE_BYTES) ? 1024 : 1000;
+   unsigned unit = 0;
    double d = num;
 
-   while (d > divisor) {
+   switch (type) {
+   case PIPE_DRIVER_QUERY_TYPE_MICROSECONDS:
+      max_unit = ARRAY_SIZE(time_units)-1;
+      units = time_units;
+      break;
+   case PIPE_DRIVER_QUERY_TYPE_PERCENTAGE:
+      max_unit = ARRAY_SIZE(percent_units)-1;
+      units = percent_units;
+      break;
+   case PIPE_DRIVER_QUERY_TYPE_BYTES:
+      max_unit = ARRAY_SIZE(byte_units)-1;
+      units = byte_units;
+      break;
+   case PIPE_DRIVER_QUERY_TYPE_HZ:
+      max_unit = ARRAY_SIZE(hz_units)-1;
+      units = hz_units;
+      break;
+   default:
+      if (max_value == 100) {
+         max_unit = ARRAY_SIZE(percent_units)-1;
+         units = percent_units;
+      } else {
+         max_unit = ARRAY_SIZE(metric_units)-1;
+         units = metric_units;
+      }
+   }
+
+   while (d > divisor && unit < max_unit) {
       d /= divisor;
       unit++;
    }
@@ -300,9 +335,9 @@ hud_pane_accumulate_vertices(struct hud_context *hud,
       unsigned y = pane->inner_y1 + pane->inner_height * (5 - i) / 5 -
                    hud->font.glyph_height / 2;
 
-      number_to_human_readable(pane->max_value * i / 5,
-                               pane->uses_byte_units, str);
-      hud_draw_string(hud, x, y, str);
+      number_to_human_readable(pane->max_value * i / 5, pane->max_value,
+                               pane->type, str);
+      hud_draw_string(hud, x, y, "%s", str);
    }
 
    /* draw info below the pane */
@@ -311,8 +346,8 @@ hud_pane_accumulate_vertices(struct hud_context *hud,
       unsigned x = pane->x1 + 2;
       unsigned y = pane->y2 + 2 + i*hud->font.glyph_height;
 
-      number_to_human_readable(gr->current_value,
-                               pane->uses_byte_units, str);
+      number_to_human_readable(gr->current_value, pane->max_value,
+                               pane->type, str);
       hud_draw_string(hud, x, y, "  %s: %s", gr->name, str);
       i++;
    }
@@ -417,8 +452,8 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_save_blend(cso);
    cso_save_depth_stencil_alpha(cso);
    cso_save_fragment_shader(cso);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_sampler_views(cso);
+   cso_save_fragment_samplers(cso);
    cso_save_rasterizer(cso);
    cso_save_viewport(cso);
    cso_save_stream_outputs(cso);
@@ -547,8 +582,8 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_restore_blend(cso);
    cso_restore_depth_stencil_alpha(cso);
    cso_restore_fragment_shader(cso);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_sampler_views(cso);
+   cso_restore_fragment_samplers(cso);
    cso_restore_rasterizer(cso);
    cso_restore_viewport(cso);
    cso_restore_stream_outputs(cso);
@@ -869,12 +904,16 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
       else if (strcmp(name, "samples-passed") == 0 &&
                has_occlusion_query(hud->pipe->screen)) {
          hud_pipe_query_install(pane, hud->pipe, "samples-passed",
-                                PIPE_QUERY_OCCLUSION_COUNTER, 0, 0, FALSE);
+                                PIPE_QUERY_OCCLUSION_COUNTER, 0, 0,
+                                PIPE_DRIVER_QUERY_TYPE_UINT64,
+                                PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
       }
       else if (strcmp(name, "primitives-generated") == 0 &&
                has_streamout(hud->pipe->screen)) {
          hud_pipe_query_install(pane, hud->pipe, "primitives-generated",
-                                PIPE_QUERY_PRIMITIVES_GENERATED, 0, 0, FALSE);
+                                PIPE_QUERY_PRIMITIVES_GENERATED, 0, 0,
+                                PIPE_DRIVER_QUERY_TYPE_UINT64,
+                                PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
       }
       else {
          boolean processed = FALSE;
@@ -901,7 +940,8 @@ hud_parse_env_var(struct hud_context *hud, const char *env)
             if (i < Elements(pipeline_statistics_names)) {
                hud_pipe_query_install(pane, hud->pipe, name,
                                       PIPE_QUERY_PIPELINE_STATISTICS, i,
-                                      0, FALSE);
+                                      0, PIPE_DRIVER_QUERY_TYPE_UINT64,
+                                      PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE);
                processed = TRUE;
             }
          }
diff --git a/src/gallium/auxiliary/hud/hud_driver_query.c b/src/gallium/auxiliary/hud/hud_driver_query.c
index 603aba7e8cd..f14305ea835 100644
--- a/src/gallium/auxiliary/hud/hud_driver_query.c
+++ b/src/gallium/auxiliary/hud/hud_driver_query.c
@@ -43,6 +43,7 @@ struct query_info {
    struct pipe_context *pipe;
    unsigned query_type;
    unsigned result_index; /* unit depends on query_type */
+   enum pipe_driver_query_result_type result_type;
 
    /* Ring of queries. If a query is busy, we use another slot. */
    struct pipe_query *query[NUM_QUERIES];
@@ -62,7 +63,8 @@ query_new_value(struct hud_graph *gr)
    uint64_t now = os_time_get();
 
    if (info->last_time) {
-      pipe->end_query(pipe, info->query[info->head]);
+      if (info->query[info->head])
+         pipe->end_query(pipe, info->query[info->head]);
 
       /* read query results */
       while (1) {
@@ -70,7 +72,7 @@ query_new_value(struct hud_graph *gr)
          union pipe_query_result result;
          uint64_t *res64 = (uint64_t *)&result;
 
-         if (pipe->get_query_result(pipe, query, FALSE, &result)) {
+         if (query && pipe->get_query_result(pipe, query, FALSE, &result)) {
             info->results_cumulative += res64[info->result_index];
             info->num_results++;
 
@@ -88,7 +90,8 @@ query_new_value(struct hud_graph *gr)
                        "gallium_hud: all queries are busy after %i frames, "
                        "can't add another query\n",
                        NUM_QUERIES);
-               pipe->destroy_query(pipe, info->query[info->head]);
+               if (info->query[info->head])
+                  pipe->destroy_query(pipe, info->query[info->head]);
                info->query[info->head] =
                      pipe->create_query(pipe, info->query_type, 0);
             }
@@ -106,22 +109,33 @@ query_new_value(struct hud_graph *gr)
       }
 
       if (info->num_results && info->last_time + gr->pane->period <= now) {
-         /* compute the average value across all frames */
-         hud_graph_add_value(gr, info->results_cumulative / info->num_results);
+         uint64_t value;
+
+         switch (info->result_type) {
+         default:
+         case PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE:
+            value = info->results_cumulative / info->num_results;
+            break;
+         case PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE:
+            value = info->results_cumulative;
+            break;
+         }
+
+         hud_graph_add_value(gr, value);
 
          info->last_time = now;
          info->results_cumulative = 0;
          info->num_results = 0;
       }
-
-      pipe->begin_query(pipe, info->query[info->head]);
    }
    else {
       /* initialize */
       info->last_time = now;
       info->query[info->head] = pipe->create_query(pipe, info->query_type, 0);
-      pipe->begin_query(pipe, info->query[info->head]);
    }
+
+   if (info->query[info->head])
+      pipe->begin_query(pipe, info->query[info->head]);
 }
 
 static void
@@ -148,7 +162,8 @@ void
 hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
                        const char *name, unsigned query_type,
                        unsigned result_index,
-                       uint64_t max_value, boolean uses_byte_units)
+                       uint64_t max_value, enum pipe_driver_query_type type,
+                       enum pipe_driver_query_result_type result_type)
 {
    struct hud_graph *gr;
    struct query_info *info;
@@ -172,12 +187,12 @@ hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
    info->pipe = pipe;
    info->query_type = query_type;
    info->result_index = result_index;
+   info->result_type = result_type;
 
    hud_pane_add_graph(pane, gr);
    if (pane->max_value < max_value)
       hud_pane_set_max_value(pane, max_value);
-   if (uses_byte_units)
-      pane->uses_byte_units = TRUE;
+   pane->type = type;
 }
 
 boolean
@@ -187,7 +202,6 @@ hud_driver_query_install(struct hud_pane *pane, struct pipe_context *pipe,
    struct pipe_screen *screen = pipe->screen;
    struct pipe_driver_query_info query;
    unsigned num_queries, i;
-   boolean uses_byte_units;
    boolean found = FALSE;
 
    if (!screen->get_driver_query_info)
@@ -206,9 +220,8 @@ hud_driver_query_install(struct hud_pane *pane, struct pipe_context *pipe,
    if (!found)
       return FALSE;
 
-   uses_byte_units = query.type == PIPE_DRIVER_QUERY_TYPE_BYTES;
    hud_pipe_query_install(pane, pipe, query.name, query.query_type, 0,
-                          query.max_value.u64, uses_byte_units);
+                          query.max_value.u64, query.type, query.result_type);
 
    return TRUE;
 }
diff --git a/src/gallium/auxiliary/hud/hud_private.h b/src/gallium/auxiliary/hud/hud_private.h
index 632926b87f5..01caf7b8b2c 100644
--- a/src/gallium/auxiliary/hud/hud_private.h
+++ b/src/gallium/auxiliary/hud/hud_private.h
@@ -66,7 +66,7 @@ struct hud_pane {
    uint64_t ceiling;
    unsigned dyn_ceil_last_ran;
    boolean dyn_ceiling;
-   boolean uses_byte_units;
+   enum pipe_driver_query_type type;
    uint64_t period; /* in microseconds */
 
    struct list_head graph_list;
@@ -89,7 +89,9 @@ void hud_cpu_graph_install(struct hud_pane *pane, unsigned cpu_index);
 void hud_pipe_query_install(struct hud_pane *pane, struct pipe_context *pipe,
                             const char *name, unsigned query_type,
                             unsigned result_index,
-                            uint64_t max_value, boolean uses_byte_units);
+                            uint64_t max_value,
+                            enum pipe_driver_query_type type,
+                            enum pipe_driver_query_result_type result_type);
 boolean hud_driver_query_install(struct hud_pane *pane,
                                  struct pipe_context *pipe, const char *name);
 
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 061f39ac6f3..93dfb803389 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -184,7 +184,8 @@ ttn_emit_declaration(struct ttn_compile *c)
          c->samp_types[decl->Range.First + i] = type;
       }
    } else {
-      nir_variable *var;
+      bool is_array = (array_size > 1);
+
       assert(file == TGSI_FILE_INPUT ||
              file == TGSI_FILE_OUTPUT ||
              file == TGSI_FILE_CONSTANT);
@@ -193,76 +194,99 @@ ttn_emit_declaration(struct ttn_compile *c)
       if ((file == TGSI_FILE_CONSTANT) && decl->Declaration.Dimension)
          return;
 
-      var = rzalloc(b->shader, nir_variable);
-      var->data.driver_location = decl->Range.First;
-
-      var->type = glsl_vec4_type();
-      if (array_size > 1)
-         var->type = glsl_array_type(var->type, array_size);
-
-      switch (file) {
-      case TGSI_FILE_INPUT:
-         var->data.read_only = true;
-         var->data.mode = nir_var_shader_in;
-         var->name = ralloc_asprintf(var, "in_%d", decl->Range.First);
-
-         /* We should probably translate to a VERT_ATTRIB_* or VARYING_SLOT_*
-          * instead, but nothing in NIR core is looking at the value
-          * currently, and this is less change to drivers.
-          */
-         var->data.location = decl->Semantic.Name;
-         var->data.index = decl->Semantic.Index;
-
-         /* We definitely need to translate the interpolation field, because
-          * nir_print will decode it.
-          */
-         switch (decl->Interp.Interpolate) {
-         case TGSI_INTERPOLATE_CONSTANT:
-            var->data.interpolation = INTERP_QUALIFIER_FLAT;
-            break;
-         case TGSI_INTERPOLATE_LINEAR:
-            var->data.interpolation = INTERP_QUALIFIER_NOPERSPECTIVE;
-            break;
-         case TGSI_INTERPOLATE_PERSPECTIVE:
-            var->data.interpolation = INTERP_QUALIFIER_SMOOTH;
-            break;
-         }
-
-         exec_list_push_tail(&b->shader->inputs, &var->node);
-         break;
-      case TGSI_FILE_OUTPUT: {
-         /* Since we can't load from outputs in the IR, we make temporaries
-          * for the outputs and emit stores to the real outputs at the end of
-          * the shader.
-          */
-         nir_register *reg = nir_local_reg_create(b->impl);
-         reg->num_components = 4;
-         if (array_size > 1)
-            reg->num_array_elems = array_size;
-
-         var->data.mode = nir_var_shader_out;
-         var->name = ralloc_asprintf(var, "out_%d", decl->Range.First);
-
-         var->data.location = decl->Semantic.Name;
-         var->data.index = decl->Semantic.Index;
-
-         for (i = 0; i < array_size; i++) {
-            c->output_regs[decl->Range.First + i].offset = i;
-            c->output_regs[decl->Range.First + i].reg = reg;
-         }
-
-         exec_list_push_tail(&b->shader->outputs, &var->node);
+      if ((file == TGSI_FILE_INPUT) || (file == TGSI_FILE_OUTPUT)) {
+         is_array = (is_array && decl->Declaration.Array &&
+                     (decl->Array.ArrayID != 0));
       }
-         break;
-      case TGSI_FILE_CONSTANT:
-         var->data.mode = nir_var_uniform;
-         var->name = ralloc_asprintf(var, "uniform_%d", decl->Range.First);
 
-         exec_list_push_tail(&b->shader->uniforms, &var->node);
-         break;
-      default:
-         unreachable("bad declaration file");
-         return;
+      for (i = 0; i < array_size; i++) {
+         unsigned idx = decl->Range.First + i;
+         nir_variable *var = rzalloc(b->shader, nir_variable);
+
+         var->data.driver_location = idx;
+
+         var->type = glsl_vec4_type();
+         if (is_array)
+            var->type = glsl_array_type(var->type, array_size);
+
+         switch (file) {
+         case TGSI_FILE_INPUT:
+            var->data.read_only = true;
+            var->data.mode = nir_var_shader_in;
+            var->name = ralloc_asprintf(var, "in_%d", idx);
+
+            /* We should probably translate to a VERT_ATTRIB_* or VARYING_SLOT_*
+             * instead, but nothing in NIR core is looking at the value
+             * currently, and this is less change to drivers.
+             */
+            var->data.location = decl->Semantic.Name;
+            var->data.index = decl->Semantic.Index;
+
+            /* We definitely need to translate the interpolation field, because
+             * nir_print will decode it.
+             */
+            switch (decl->Interp.Interpolate) {
+            case TGSI_INTERPOLATE_CONSTANT:
+               var->data.interpolation = INTERP_QUALIFIER_FLAT;
+               break;
+            case TGSI_INTERPOLATE_LINEAR:
+               var->data.interpolation = INTERP_QUALIFIER_NOPERSPECTIVE;
+               break;
+            case TGSI_INTERPOLATE_PERSPECTIVE:
+               var->data.interpolation = INTERP_QUALIFIER_SMOOTH;
+               break;
+            }
+
+            exec_list_push_tail(&b->shader->inputs, &var->node);
+            break;
+         case TGSI_FILE_OUTPUT: {
+            /* Since we can't load from outputs in the IR, we make temporaries
+             * for the outputs and emit stores to the real outputs at the end of
+             * the shader.
+             */
+            nir_register *reg = nir_local_reg_create(b->impl);
+            reg->num_components = 4;
+            if (is_array)
+               reg->num_array_elems = array_size;
+
+            var->data.mode = nir_var_shader_out;
+            var->name = ralloc_asprintf(var, "out_%d", idx);
+
+            var->data.location = decl->Semantic.Name;
+            if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
+                decl->Semantic.Index == 0 &&
+                c->scan->properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+               var->data.index = -1;
+            else
+               var->data.index = decl->Semantic.Index;
+
+            if (is_array) {
+               unsigned j;
+               for (j = 0; j < array_size; j++) {
+                  c->output_regs[idx + j].offset = i + j;
+                  c->output_regs[idx + j].reg = reg;
+               }
+            } else {
+               c->output_regs[idx].offset = i;
+               c->output_regs[idx].reg = reg;
+            }
+
+            exec_list_push_tail(&b->shader->outputs, &var->node);
+         }
+            break;
+         case TGSI_FILE_CONSTANT:
+            var->data.mode = nir_var_uniform;
+            var->name = ralloc_asprintf(var, "uniform_%d", idx);
+
+            exec_list_push_tail(&b->shader->uniforms, &var->node);
+            break;
+         default:
+            unreachable("bad declaration file");
+            return;
+         }
+
+         if (is_array)
+            break;
       }
 
    }
@@ -975,6 +999,9 @@ static void
 setup_texture_info(nir_tex_instr *instr, unsigned texture)
 {
    switch (texture) {
+   case TGSI_TEXTURE_BUFFER:
+      instr->sampler_dim = GLSL_SAMPLER_DIM_BUF;
+      break;
    case TGSI_TEXTURE_1D:
       instr->sampler_dim = GLSL_SAMPLER_DIM_1D;
       break;
@@ -1068,6 +1095,11 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
       op = nir_texop_txb;
       num_srcs = 2;
       break;
+   case TGSI_OPCODE_TXB2:
+      op = nir_texop_txb;
+      num_srcs = 2;
+      samp = 2;
+      break;
    case TGSI_OPCODE_TXL:
       op = nir_texop_txl;
       num_srcs = 2;
@@ -1078,7 +1110,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
       samp = 2;
       break;
    case TGSI_OPCODE_TXF:
-      op = nir_texop_txf;
+      if (tgsi_inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
+          tgsi_inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+         op = nir_texop_txf_ms;
+      } else {
+         op = nir_texop_txf;
+      }
       num_srcs = 2;
       break;
    case TGSI_OPCODE_TXD:
@@ -1164,6 +1201,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
       src_number++;
    }
 
+   if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_TXB2) {
+      instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[1], X));
+      instr->src[src_number].src_type = nir_tex_src_bias;
+      src_number++;
+   }
+
    if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_TXL) {
       instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W));
       instr->src[src_number].src_type = nir_tex_src_lod;
@@ -1178,7 +1221,10 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
 
    if (tgsi_inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
       instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W));
-      instr->src[src_number].src_type = nir_tex_src_lod;
+      if (op == nir_texop_txf_ms)
+         instr->src[src_number].src_type = nir_tex_src_ms_index;
+      else
+         instr->src[src_number].src_type = nir_tex_src_lod;
       src_number++;
    }
 
@@ -1472,7 +1518,7 @@ ttn_emit_instruction(struct ttn_compile *c)
       return;
 
    nir_ssa_def *src[TGSI_FULL_MAX_SRC_REGISTERS];
-   for (i = 0; i < TGSI_FULL_MAX_SRC_REGISTERS; i++) {
+   for (i = 0; i < tgsi_inst->Instruction.NumSrcRegs; i++) {
       src[i] = ttn_get_src(c, &tgsi_inst->Src[i]);
    }
    nir_alu_dest dest = ttn_get_dest(c, tgsi_dst);
@@ -1708,9 +1754,11 @@ ttn_add_output_stores(struct ttn_compile *c)
       for (i = 0; i < array_len; i++) {
          nir_intrinsic_instr *store =
             nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+         unsigned loc = var->data.driver_location + i;
          store->num_components = 4;
-         store->const_index[0] = var->data.driver_location + i;
-         store->src[0].reg.reg = c->output_regs[var->data.driver_location].reg;
+         store->const_index[0] = loc;
+         store->src[0].reg.reg = c->output_regs[loc].reg;
+         store->src[0].reg.base_offset = c->output_regs[loc].offset;
          nir_instr_insert_after_cf_list(b->cf_node_list, &store->instr);
       }
    }
diff --git a/src/gallium/auxiliary/os/os_memory_aligned.h b/src/gallium/auxiliary/os/os_memory_aligned.h
index bb15f24ade3..f7d0e3652ed 100644
--- a/src/gallium/auxiliary/os/os_memory_aligned.h
+++ b/src/gallium/auxiliary/os/os_memory_aligned.h
@@ -55,7 +55,7 @@ add_overflow_size_t(size_t a, size_t b, size_t *res)
 /**
  * Return memory on given byte alignment
  */
-static INLINE void *
+static inline void *
 os_malloc_aligned(size_t size, size_t alignment)
 {
    char *ptr, *buf;
@@ -87,7 +87,7 @@ os_malloc_aligned(size_t size, size_t alignment)
 /**
  * Free memory returned by align_malloc().
  */
-static INLINE void
+static inline void
 os_free_aligned(void *ptr)
 {
    if (ptr) {
diff --git a/src/gallium/auxiliary/os/os_memory_stdc.h b/src/gallium/auxiliary/os/os_memory_stdc.h
index 806e5363568..c9fde06d8ac 100644
--- a/src/gallium/auxiliary/os/os_memory_stdc.h
+++ b/src/gallium/auxiliary/os/os_memory_stdc.h
@@ -50,7 +50,7 @@
 
 #if defined(HAVE_POSIX_MEMALIGN)
 
-static INLINE void *
+static inline void *
 os_malloc_aligned(size_t size, size_t alignment)
 {
    void *ptr;
diff --git a/src/gallium/auxiliary/os/os_mman.h b/src/gallium/auxiliary/os/os_mman.h
index e892610bdbd..2ae0027c1c2 100644
--- a/src/gallium/auxiliary/os/os_mman.h
+++ b/src/gallium/auxiliary/os/os_mman.h
@@ -58,7 +58,7 @@ extern "C" {
 
 extern void *__mmap2(void *, size_t, int, int, int, size_t);
 
-static INLINE void *os_mmap(void *addr, size_t length, int prot, int flags,
+static inline void *os_mmap(void *addr, size_t length, int prot, int flags,
                             int fd, loff_t offset)
 {
    /* offset must be aligned to 4096 (not necessarily the page size) */
@@ -78,7 +78,7 @@ static INLINE void *os_mmap(void *addr, size_t length, int prot, int flags,
 #  define os_mmap(addr, length, prot, flags, fd, offset) \
              mmap(addr, length, prot, flags, fd, offset)
 
-static INLINE int os_munmap(void *addr, size_t length)
+static inline int os_munmap(void *addr, size_t length)
 {
    /* Copied from configure code generated by AC_SYS_LARGEFILE */
 #define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + \
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
index e9da8954885..be8adcc6cf2 100644
--- a/src/gallium/auxiliary/os/os_thread.h
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -54,7 +54,7 @@ typedef thrd_t pipe_thread;
 #define PIPE_THREAD_ROUTINE( name, param ) \
    int name( void *param )
 
-static INLINE pipe_thread pipe_thread_create( PIPE_THREAD_ROUTINE((*routine), ), void *param )
+static inline pipe_thread pipe_thread_create( PIPE_THREAD_ROUTINE((*routine), ), void *param )
 {
    pipe_thread thread;
 #ifdef HAVE_PTHREAD
@@ -75,17 +75,17 @@ static INLINE pipe_thread pipe_thread_create( PIPE_THREAD_ROUTINE((*routine), ),
    return thread;
 }
 
-static INLINE int pipe_thread_wait( pipe_thread thread )
+static inline int pipe_thread_wait( pipe_thread thread )
 {
    return thrd_join( thread, NULL );
 }
 
-static INLINE int pipe_thread_destroy( pipe_thread thread )
+static inline int pipe_thread_destroy( pipe_thread thread )
 {
    return thrd_detach( thread );
 }
 
-static INLINE void pipe_thread_setname( const char *name )
+static inline void pipe_thread_setname( const char *name )
 {
 #if defined(HAVE_PTHREAD)
 #  if defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
@@ -145,17 +145,17 @@ typedef cnd_t pipe_condvar;
 
 typedef pthread_barrier_t pipe_barrier;
 
-static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
+static inline void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
 {
    pthread_barrier_init(barrier, NULL, count);
 }
 
-static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
+static inline void pipe_barrier_destroy(pipe_barrier *barrier)
 {
    pthread_barrier_destroy(barrier);
 }
 
-static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
+static inline void pipe_barrier_wait(pipe_barrier *barrier)
 {
    pthread_barrier_wait(barrier);
 }
@@ -171,7 +171,7 @@ typedef struct {
    pipe_condvar condvar;
 } pipe_barrier;
 
-static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
+static inline void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
 {
    barrier->count = count;
    barrier->waiters = 0;
@@ -180,14 +180,14 @@ static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
    pipe_condvar_init(barrier->condvar);
 }
 
-static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
+static inline void pipe_barrier_destroy(pipe_barrier *barrier)
 {
    assert(barrier->waiters == 0);
    pipe_mutex_destroy(barrier->mutex);
    pipe_condvar_destroy(barrier->condvar);
 }
 
-static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
+static inline void pipe_barrier_wait(pipe_barrier *barrier)
 {
    pipe_mutex_lock(barrier->mutex);
 
@@ -225,7 +225,7 @@ typedef struct
 } pipe_semaphore;
 
 
-static INLINE void
+static inline void
 pipe_semaphore_init(pipe_semaphore *sema, int init_val)
 {
    pipe_mutex_init(sema->mutex);
@@ -233,7 +233,7 @@ pipe_semaphore_init(pipe_semaphore *sema, int init_val)
    sema->counter = init_val;
 }
 
-static INLINE void
+static inline void
 pipe_semaphore_destroy(pipe_semaphore *sema)
 {
    pipe_mutex_destroy(sema->mutex);
@@ -241,7 +241,7 @@ pipe_semaphore_destroy(pipe_semaphore *sema)
 }
 
 /** Signal/increment semaphore counter */
-static INLINE void
+static inline void
 pipe_semaphore_signal(pipe_semaphore *sema)
 {
    pipe_mutex_lock(sema->mutex);
@@ -251,7 +251,7 @@ pipe_semaphore_signal(pipe_semaphore *sema)
 }
 
 /** Wait for semaphore counter to be greater than zero */
-static INLINE void
+static inline void
 pipe_semaphore_wait(pipe_semaphore *sema)
 {
    pipe_mutex_lock(sema->mutex);
@@ -277,7 +277,7 @@ typedef struct {
 #define PIPE_TSD_INIT_MAGIC 0xff8adc98
 
 
-static INLINE void
+static inline void
 pipe_tsd_init(pipe_tsd *tsd)
 {
    if (tss_create(&tsd->key, NULL/*free*/) != 0) {
@@ -286,7 +286,7 @@ pipe_tsd_init(pipe_tsd *tsd)
    tsd->initMagic = PIPE_TSD_INIT_MAGIC;
 }
 
-static INLINE void *
+static inline void *
 pipe_tsd_get(pipe_tsd *tsd)
 {
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
@@ -295,7 +295,7 @@ pipe_tsd_get(pipe_tsd *tsd)
    return tss_get(tsd->key);
 }
 
-static INLINE void
+static inline void
 pipe_tsd_set(pipe_tsd *tsd, void *value)
 {
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
diff --git a/src/gallium/auxiliary/os/os_time.c b/src/gallium/auxiliary/os/os_time.c
index f7e4ca49c7c..3d2e4167222 100644
--- a/src/gallium/auxiliary/os/os_time.c
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -33,11 +33,13 @@
  */
 
 
-#include "pipe/p_config.h"
+#include "pipe/p_defines.h"
+#include "util/u_atomic.h"
 
 #if defined(PIPE_OS_UNIX)
 #  include <time.h> /* timeval */
 #  include <sys/time.h> /* timeval */
+#  include <sched.h> /* sched_yield */
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <windows.h>
 #else
@@ -92,3 +94,108 @@ os_time_sleep(int64_t usecs)
 }
 
 #endif
+
+
+/**
+ * Convert a relative timeout in nanoseconds into an absolute timeout,
+ * i.e. os_time_get_nano() + timeout.
+ *
+ * \param timeout  relative timeout in ns, or PIPE_TIMEOUT_INFINITE
+ * \return  the absolute timeout in ns; PIPE_TIMEOUT_INFINITE if the input
+ *          is infinite, exceeds INT64_MAX, or the sum would overflow
+ */
+int64_t
+os_time_get_absolute_timeout(uint64_t timeout)
+{
+   int64_t time;
+
+   /* Also check for the type upper bound. */
+   if (timeout == PIPE_TIMEOUT_INFINITE || timeout > INT64_MAX)
+      return PIPE_TIMEOUT_INFINITE;
+
+   time = os_time_get_nano();
+
+   /* Check for overflow before adding: signed overflow is undefined
+    * behavior, so inspecting a wrapped sum afterwards is not reliable.
+    * A non-positive time cannot overflow because timeout <= INT64_MAX.
+    */
+   if (time > 0 && (int64_t)timeout > INT64_MAX - time)
+      return PIPE_TIMEOUT_INFINITE;
+
+   return time + (int64_t)timeout;
+}
+
+
+/**
+ * Poll *var until it reads as zero or a relative timeout expires.
+ *
+ * \param var      variable to poll
+ * \param timeout  relative timeout in ns; 0 only checks the current value,
+ *                 PIPE_TIMEOUT_INFINITE never gives up
+ * \return  true if the variable was read as zero, false on timeout
+ */
+bool
+os_wait_until_zero(volatile int *var, uint64_t timeout)
+{
+   /* Fast path: already zero, no need to look at the clock. */
+   if (!p_atomic_read(var))
+      return true;
+
+   if (!timeout)
+      return false;
+
+   if (timeout == PIPE_TIMEOUT_INFINITE) {
+      while (p_atomic_read(var)) {
+#if defined(PIPE_OS_UNIX)
+         sched_yield();
+#endif
+      }
+      return true;
+   }
+   else {
+      int64_t start_time = os_time_get_nano();
+      /* NOTE(review): the sum may wrap for very large timeouts; this appears
+       * to rely on os_time_timeout() accepting end < start - confirm.
+       */
+      int64_t end_time = start_time + timeout;
+
+      while (p_atomic_read(var)) {
+         if (os_time_timeout(start_time, end_time, os_time_get_nano()))
+            return false;
+
+#if defined(PIPE_OS_UNIX)
+         sched_yield();
+#endif
+      }
+      return true;
+   }
+}
+
+
+/**
+ * Poll *var until it reads as zero or an absolute deadline passes.
+ *
+ * \param var      variable to poll
+ * \param timeout  absolute time in ns (compared against os_time_get_nano())
+ *                 at which to give up, or PIPE_TIMEOUT_INFINITE
+ * \return  true if the variable was read as zero, false on timeout
+ */
+bool
+os_wait_until_zero_abs_timeout(volatile int *var, int64_t timeout)
+{
+   if (!p_atomic_read(var))
+      return true;
+
+   if (timeout == PIPE_TIMEOUT_INFINITE)
+      return os_wait_until_zero(var, PIPE_TIMEOUT_INFINITE);
+
+   while (p_atomic_read(var)) {
+      if (os_time_get_nano() >= timeout)
+         return false;
+
+#if defined(PIPE_OS_UNIX)
+      sched_yield();
+#endif
+   }
+   return true;
+}
diff --git a/src/gallium/auxiliary/os/os_time.h b/src/gallium/auxiliary/os/os_time.h
index 4fab03cc671..9312e028809 100644
--- a/src/gallium/auxiliary/os/os_time.h
+++ b/src/gallium/auxiliary/os/os_time.h
@@ -45,7 +45,7 @@
 #include "pipe/p_compiler.h"
 
 
-#ifdef	__cplusplus
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -60,9 +60,10 @@ os_time_get_nano(void);
 /*
  * Get the current time in microseconds from an unknown base.
  */
-static INLINE int64_t
-os_time_get(void) {
-    return os_time_get_nano() / 1000;
+static inline int64_t
+os_time_get(void)
+{
+   return os_time_get_nano() / 1000;
 }
 
 
@@ -82,19 +83,56 @@ os_time_sleep(int64_t usecs);
  *
  * Returns true if the current time has elapsed beyond the specified interval.
  */
-static INLINE boolean
+static inline boolean
 os_time_timeout(int64_t start,
                 int64_t end,
                 int64_t curr)
 {
-   if(start <= end)
+   if (start <= end)
       return !(start <= curr && curr < end);
    else
       return !((start <= curr) || (curr < end));
 }
 
 
-#ifdef	__cplusplus
+/**
+ * Convert a relative timeout in nanoseconds into an absolute timeout,
+ * in other words, it returns current time + timeout.
+ * os_time_get_nano() must be monotonic.
+ * PIPE_TIMEOUT_INFINITE is passed through unchanged. If the calculation
+ * overflows, PIPE_TIMEOUT_INFINITE is returned.
+ */
+int64_t
+os_time_get_absolute_timeout(uint64_t timeout);
+
+
+/**
+ * Wait until the variable at the given memory location is zero.
+ *
+ * \param var           variable
+ * \param timeout       timeout in ns, can be anything from 0 (no wait) to
+ *                      PIPE_TIMEOUT_INFINITE (wait forever)
+ * \return     true if the variable is zero
+ */
+bool
+os_wait_until_zero(volatile int *var, uint64_t timeout);
+
+
+/**
+ * Wait until the variable at the given memory location is zero.
+ * The timeout is the absolute time when the waiting should stop. If it is
+ * less than or equal to the current time, it only returns the status and
+ * doesn't wait. PIPE_TIMEOUT_INFINITE waits forever. This requires that
+ * os_time_get_nano is monotonic.
+ *
+ * \param var       variable
+ * \param timeout   the time in ns when the waiting should stop
+ * \return     true if the variable is zero
+ */
+bool
+os_wait_until_zero_abs_timeout(volatile int *var, int64_t timeout);
+
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/src/gallium/auxiliary/pipe-loader/Makefile.am b/src/gallium/auxiliary/pipe-loader/Makefile.am
index cb6035d85c9..8c837996539 100644
--- a/src/gallium/auxiliary/pipe-loader/Makefile.am
+++ b/src/gallium/auxiliary/pipe-loader/Makefile.am
@@ -1,37 +1,28 @@
 include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
 
-AM_CPPFLAGS = $(DEFINES) \
-	$(VISIBILITY_CFLAGS) \
-	-I$(top_srcdir)/include \
-	-I$(top_srcdir)/src \
+# XXX: check if we need the gallium/winsys include
+AM_CFLAGS = \
 	-I$(top_srcdir)/src/loader \
-	-I$(top_srcdir)/src/gallium/include \
-	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/gallium/winsys
+	-I$(top_srcdir)/src/gallium/winsys \
+	$(GALLIUM_PIPE_LOADER_DEFINES) \
+	$(GALLIUM_CFLAGS) \
+	$(VISIBILITY_CFLAGS)
 
 noinst_LTLIBRARIES = libpipe_loader.la
-noinst_LTLIBRARIES += libpipe_loader_client.la
+
+libpipe_loader_la_SOURCES = \
+	$(COMMON_SOURCES)
 
 if HAVE_DRM_LOADER_GALLIUM
-AM_CFLAGS = $(LIBDRM_CFLAGS)
+AM_CFLAGS += \
+	$(LIBDRM_CFLAGS)
 
-COMMON_SOURCES += $(DRM_SOURCES)
+libpipe_loader_la_SOURCES += \
+	$(DRM_SOURCES)
 
-COMMON_LIBADD = \
+libpipe_loader_la_LIBADD = \
 	$(top_builddir)/src/loader/libloader.la
 
 endif
 
-libpipe_loader_la_CFLAGS  = \
-	$(GALLIUM_PIPE_LOADER_DEFINES) \
-	$(AM_CFLAGS) $(AM_CPPFLAGS)
-libpipe_loader_la_SOURCES = $(COMMON_SOURCES)
-libpipe_loader_la_LIBADD  = $(COMMON_LIBADD) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
-
-libpipe_loader_client_la_CFLAGS  = \
-	$(GALLIUM_PIPE_LOADER_CLIENT_DEFINES) \
-	$(AM_CFLAGS) $(AM_CPPFLAGS)
-libpipe_loader_client_la_SOURCES = $(COMMON_SOURCES)
-libpipe_loader_client_la_LIBADD  = $(COMMON_LIBADD) \
-	$(GALLIUM_PIPE_LOADER_CLIENT_LIBS)
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
index 9f43f17a6e2..9b8712666bb 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h
@@ -36,10 +36,6 @@
 #include "pipe/p_compiler.h"
 #include "state_tracker/drm_driver.h"
 
-#ifdef HAVE_PIPE_LOADER_XLIB
-#include <X11/Xlib.h>
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -116,21 +112,6 @@ pipe_loader_configuration(struct pipe_loader_device *dev,
 void
 pipe_loader_release(struct pipe_loader_device **devs, int ndev);
 
-#ifdef HAVE_PIPE_LOADER_XLIB
-
-/**
- * Initialize Xlib for an associated display.
- *
- * This function is platform-specific.
- *
- * \sa pipe_loader_probe
- */
-bool
-pipe_loader_sw_probe_xlib(struct pipe_loader_device **devs, Display *display);
-
-#endif
-
-
 #ifdef HAVE_PIPE_LOADER_DRI
 
 /**
@@ -195,13 +176,9 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev);
  * This function is platform-specific.
  *
  * \sa pipe_loader_probe
- *
- * \param auth_x If true, the pipe-loader will attempt to
- *               authenticate with the X server.
  */
 bool
-pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
-                         boolean auth_x);
+pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd);
 
 #endif
 
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index ffeb29906b5..1799df7e4c5 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -35,12 +35,6 @@
 #include <xf86drm.h>
 #include <unistd.h>
 
-#ifdef HAVE_PIPE_LOADER_XCB
-
-#include <xcb/dri2.h>
-
-#endif
-
 #include "loader.h"
 #include "state_tracker/drm_driver.h"
 #include "pipe_loader_priv.h"
@@ -64,78 +58,8 @@ struct pipe_loader_drm_device {
 
 static struct pipe_loader_ops pipe_loader_drm_ops;
 
-#ifdef HAVE_PIPE_LOADER_XCB
-
-static xcb_screen_t *
-get_xcb_screen(xcb_screen_iterator_t iter, int screen)
-{
-    for (; iter.rem; --screen, xcb_screen_next(&iter))
-        if (screen == 0)
-            return iter.data;
-
-    return NULL;
-}
-
-#endif
-
-static void
-pipe_loader_drm_x_auth(int fd)
-{
-#ifdef HAVE_PIPE_LOADER_XCB
-   /* Try authenticate with the X server to give us access to devices that X
-    * is running on. */
-   xcb_connection_t *xcb_conn;
-   const xcb_setup_t *xcb_setup;
-   xcb_screen_iterator_t s;
-   xcb_dri2_connect_cookie_t connect_cookie;
-   xcb_dri2_connect_reply_t *connect;
-   drm_magic_t magic;
-   xcb_dri2_authenticate_cookie_t authenticate_cookie;
-   xcb_dri2_authenticate_reply_t *authenticate;
-   int screen;
-
-   xcb_conn = xcb_connect(NULL, &screen);
-
-   if(!xcb_conn)
-      return;
-
-   xcb_setup = xcb_get_setup(xcb_conn);
-
-  if (!xcb_setup)
-    goto disconnect;
-
-   s = xcb_setup_roots_iterator(xcb_setup);
-   connect_cookie = xcb_dri2_connect_unchecked(xcb_conn,
-                                               get_xcb_screen(s, screen)->root,
-                                               XCB_DRI2_DRIVER_TYPE_DRI);
-   connect = xcb_dri2_connect_reply(xcb_conn, connect_cookie, NULL);
-
-   if (!connect || connect->driver_name_length
-                   + connect->device_name_length == 0) {
-
-      goto disconnect;
-   }
-
-   if (drmGetMagic(fd, &magic))
-      goto disconnect;
-
-   authenticate_cookie = xcb_dri2_authenticate_unchecked(xcb_conn,
-                                                         s.data->root,
-                                                         magic);
-   authenticate = xcb_dri2_authenticate_reply(xcb_conn,
-                                              authenticate_cookie,
-                                              NULL);
-   FREE(authenticate);
-
-disconnect:
-   xcb_disconnect(xcb_conn);
-
-#endif
-}
-
 bool
-pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
-                         boolean auth_x)
+pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd)
 {
    struct pipe_loader_drm_device *ddev = CALLOC_STRUCT(pipe_loader_drm_device);
    int vendor_id, chip_id;
@@ -153,9 +77,6 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
    ddev->base.ops = &pipe_loader_drm_ops;
    ddev->fd = fd;
 
-   if (auth_x)
-      pipe_loader_drm_x_auth(fd);
-
    ddev->base.driver_name = loader_get_driver_for_fd(fd, _LOADER_GALLIUM);
    if (!ddev->base.driver_name)
       goto fail;
@@ -168,35 +89,20 @@ pipe_loader_drm_probe_fd(struct pipe_loader_device **dev, int fd,
    return false;
 }
 
-static int
-open_drm_minor(int minor)
-{
-   char path[PATH_MAX];
-   snprintf(path, sizeof(path), DRM_DEV_NAME, DRM_DIR_NAME, minor);
-   return open(path, O_RDWR, 0);
-}
-
 static int
 open_drm_render_node_minor(int minor)
 {
    char path[PATH_MAX];
    snprintf(path, sizeof(path), DRM_RENDER_NODE_DEV_NAME_FORMAT, DRM_DIR_NAME,
             minor);
-   return open(path, O_RDWR, 0);
+   return loader_open_device(path);
 }
 
 int
 pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
 {
-   int i, k, fd, num_render_node_devs;
-   int j = 0;
+   int i, j, fd;
 
-   struct {
-      unsigned vendor_id;
-      unsigned chip_id;
-   } render_node_devs[DRM_RENDER_NODE_MAX_NODES];
-
-   /* Look for render nodes first */
    for (i = DRM_RENDER_NODE_MIN_MINOR, j = 0;
         i <= DRM_RENDER_NODE_MAX_MINOR; i++) {
       fd = open_drm_render_node_minor(i);
@@ -204,14 +110,11 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
       if (fd < 0)
          continue;
 
-      if (!pipe_loader_drm_probe_fd(&dev, fd, false)) {
+      if (!pipe_loader_drm_probe_fd(&dev, fd)) {
          close(fd);
          continue;
       }
 
-      render_node_devs[j].vendor_id = dev->u.pci.vendor_id;
-      render_node_devs[j].chip_id = dev->u.pci.chip_id;
-
       if (j < ndev) {
          devs[j] = dev;
       } else {
@@ -221,46 +124,6 @@ pipe_loader_drm_probe(struct pipe_loader_device **devs, int ndev)
       j++;
    }
 
-   num_render_node_devs = j;
-
-   /* Next look for drm devices. */
-   for (i = 0; i < DRM_MAX_MINOR; i++) {
-      struct pipe_loader_device *dev;
-      boolean duplicate = FALSE;
-      fd = open_drm_minor(i);
-      if (fd < 0)
-         continue;
-
-      if (!pipe_loader_drm_probe_fd(&dev, fd, true)) {
-         close(fd);
-         continue;
-      }
-
-      /* Check to make sure we aren't already accessing this device via
-       * render nodes.
-       */
-      for (k = 0; k < num_render_node_devs; k++) {
-         if (dev->u.pci.vendor_id == render_node_devs[k].vendor_id &&
-             dev->u.pci.chip_id == render_node_devs[k].chip_id) {
-            close(fd);
-            dev->ops->release(&dev);
-            duplicate = TRUE;
-            break;
-         }
-      }
-
-      if (duplicate)
-         continue;
-
-      if (j < ndev) {
-         devs[j] = dev;
-      } else {
-         dev->ops->release(&dev);
-      }
-
-      j++;
-   }
-
    return j;
 }
 
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
index 3d332645231..6794930193d 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -32,10 +32,6 @@
 #include "sw/dri/dri_sw_winsys.h"
 #include "sw/null/null_sw_winsys.h"
 #include "sw/wrapper/wrapper_sw_winsys.h"
-#ifdef HAVE_PIPE_LOADER_XLIB
-/* Explicitly wrap the header to ease build without X11 headers */
-#include "sw/xlib/xlib_sw_winsys.h"
-#endif
 #include "target-helpers/inline_sw_helper.h"
 #include "state_tracker/drisw_api.h"
 
@@ -53,29 +49,6 @@ static struct sw_winsys *(*backends[])() = {
    null_sw_create
 };
 
-#ifdef HAVE_PIPE_LOADER_XLIB
-bool
-pipe_loader_sw_probe_xlib(struct pipe_loader_device **devs, Display *display)
-{
-   struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device);
-
-   if (!sdev)
-      return false;
-
-   sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
-   sdev->base.driver_name = "swrast";
-   sdev->base.ops = &pipe_loader_sw_ops;
-   sdev->ws = xlib_create_sw_winsys(display);
-   if (!sdev->ws) {
-      FREE(sdev);
-      return false;
-   }
-   *devs = &sdev->base;
-
-   return true;
-}
-#endif
-
 #ifdef HAVE_PIPE_LOADER_DRI
 bool
 pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, struct drisw_loader_funcs *drisw_lf)
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 03bdce31513..ba48d461d5c 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -158,7 +158,7 @@ struct pb_vtbl
 
 /* Accessor functions for pb->vtbl:
  */
-static INLINE void *
+static inline void *
 pb_map(struct pb_buffer *buf, 
        unsigned flags, void *flush_ctx)
 {
@@ -170,7 +170,7 @@ pb_map(struct pb_buffer *buf,
 }
 
 
-static INLINE void 
+static inline void 
 pb_unmap(struct pb_buffer *buf)
 {
    assert(buf);
@@ -181,7 +181,7 @@ pb_unmap(struct pb_buffer *buf)
 }
 
 
-static INLINE void
+static inline void
 pb_get_base_buffer( struct pb_buffer *buf,
 		    struct pb_buffer **base_buf,
 		    pb_size *offset )
@@ -200,7 +200,7 @@ pb_get_base_buffer( struct pb_buffer *buf,
 }
 
 
-static INLINE enum pipe_error 
+static inline enum pipe_error 
 pb_validate(struct pb_buffer *buf, struct pb_validate *vl, unsigned flags)
 {
    assert(buf);
@@ -211,7 +211,7 @@ pb_validate(struct pb_buffer *buf, struct pb_validate *vl, unsigned flags)
 }
 
 
-static INLINE void 
+static inline void 
 pb_fence(struct pb_buffer *buf, struct pipe_fence_handle *fence)
 {
    assert(buf);
@@ -222,7 +222,7 @@ pb_fence(struct pb_buffer *buf, struct pipe_fence_handle *fence)
 }
 
 
-static INLINE void 
+static inline void 
 pb_destroy(struct pb_buffer *buf)
 {
    assert(buf);
@@ -232,7 +232,7 @@ pb_destroy(struct pb_buffer *buf)
    buf->vtbl->destroy(buf);
 }
 
-static INLINE void
+static inline void
 pb_reference(struct pb_buffer **dst,
              struct pb_buffer *src)
 {
@@ -248,7 +248,7 @@ pb_reference(struct pb_buffer **dst,
  * Utility function to check whether the provided alignment is consistent with
  * the requested or not.
  */
-static INLINE boolean
+static inline boolean
 pb_check_alignment(pb_size requested, pb_size provided)
 {
    if(!requested)
@@ -265,7 +265,7 @@ pb_check_alignment(pb_size requested, pb_size provided)
  * Utility function to check whether the provided alignment is consistent with
  * the requested or not.
  */
-static INLINE boolean
+static inline boolean
 pb_check_usage(unsigned requested, unsigned provided)
 {
    return (requested & provided) == requested ? TRUE : FALSE;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index fc81e11b972..08935b4dec7 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -149,7 +149,7 @@ struct fenced_buffer
 };
 
 
-static INLINE struct fenced_manager *
+static inline struct fenced_manager *
 fenced_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -157,7 +157,7 @@ fenced_manager(struct pb_manager *mgr)
 }
 
 
-static INLINE struct fenced_buffer *
+static inline struct fenced_buffer *
 fenced_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -240,7 +240,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 }
 
 
-static INLINE void
+static inline void
 fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
                              struct fenced_buffer *fenced_buf)
 {
@@ -265,7 +265,7 @@ fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
  *
  * Reference count should be incremented before calling this function.
  */
-static INLINE void
+static inline void
 fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
                          struct fenced_buffer *fenced_buf)
 {
@@ -289,7 +289,7 @@ fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
  *
  * Returns TRUE if the buffer was detroyed.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -326,7 +326,7 @@ fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
  * This function will release and re-acquire the mutex, so any copy of mutable
  * state must be discarded after calling it.
  */
-static INLINE enum pipe_error
+static inline enum pipe_error
 fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -550,7 +550,7 @@ fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
  * This function is a shorthand around pb_manager::create_buffer for
  * fenced_buffer_create_gpu_storage_locked()'s benefit.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_try_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
                                             struct fenced_buffer *fenced_buf)
 {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index bf1a538bf79..b97771457d6 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -49,7 +49,7 @@ struct malloc_buffer
 
 extern const struct pb_vtbl malloc_buffer_vtbl;
 
-static INLINE struct malloc_buffer *
+static inline struct malloc_buffer *
 malloc_buffer(struct pb_buffer *buf)
 {
    assert(buf);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
index 62df2a6b9de..47cbaeb20ac 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -50,7 +50,7 @@ struct pb_alt_manager
 };
 
 
-static INLINE struct pb_alt_manager *
+static inline struct pb_alt_manager *
 pb_alt_manager(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 5023687ec04..3b35049f679 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -88,7 +88,7 @@ struct pb_cache_manager
 };
 
 
-static INLINE struct pb_cache_buffer *
+static inline struct pb_cache_buffer *
 pb_cache_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -96,7 +96,7 @@ pb_cache_buffer(struct pb_buffer *buf)
 }
 
 
-static INLINE struct pb_cache_manager *
+static inline struct pb_cache_manager *
 pb_cache_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -107,7 +107,7 @@ pb_cache_manager(struct pb_manager *mgr)
 /**
  * Actually destroy the buffer.
  */
-static INLINE void
+static inline void
 _pb_cache_buffer_destroy(struct pb_cache_buffer *buf)
 {
    struct pb_cache_manager *mgr = buf->mgr;
@@ -235,7 +235,7 @@ pb_cache_buffer_vtbl = {
 };
 
 
-static INLINE int
+static inline int
 pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,  
                           pb_size size,
                           const struct pb_desc *desc)
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 6236afb70d1..7ad70f293a6 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -99,7 +99,7 @@ struct pb_debug_manager
 };
 
 
-static INLINE struct pb_debug_buffer *
+static inline struct pb_debug_buffer *
 pb_debug_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -107,7 +107,7 @@ pb_debug_buffer(struct pb_buffer *buf)
 }
 
 
-static INLINE struct pb_debug_manager *
+static inline struct pb_debug_manager *
 pb_debug_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -123,7 +123,7 @@ static const uint8_t random_pattern[32] = {
 };
 
 
-static INLINE void 
+static inline void 
 fill_random_pattern(uint8_t *dst, pb_size size)
 {
    pb_size i = 0;
@@ -134,7 +134,7 @@ fill_random_pattern(uint8_t *dst, pb_size size)
 }
 
 
-static INLINE boolean 
+static inline boolean 
 check_random_pattern(const uint8_t *dst, pb_size size, 
                      pb_size *min_ofs, pb_size *max_ofs) 
 {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index 84eb6edda34..72099ba5850 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -65,7 +65,7 @@ struct mm_pb_manager
 };
 
 
-static INLINE struct mm_pb_manager *
+static inline struct mm_pb_manager *
 mm_pb_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -83,7 +83,7 @@ struct mm_buffer
 };
 
 
-static INLINE struct mm_buffer *
+static inline struct mm_buffer *
 mm_buffer(struct pb_buffer *buf)
 {
    assert(buf);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
index 77e642ada08..c20e2dca02d 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
@@ -70,7 +70,7 @@ struct pb_ondemand_manager
 
 extern const struct pb_vtbl pb_ondemand_buffer_vtbl;
 
-static INLINE struct pb_ondemand_buffer *
+static inline struct pb_ondemand_buffer *
 pb_ondemand_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -80,7 +80,7 @@ pb_ondemand_buffer(struct pb_buffer *buf)
    return (struct pb_ondemand_buffer *)buf;
 }
 
-static INLINE struct pb_ondemand_manager *
+static inline struct pb_ondemand_manager *
 pb_ondemand_manager(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 51525b0f97c..56a5e82ece0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -73,7 +73,7 @@ struct pool_pb_manager
 };
 
 
-static INLINE struct pool_pb_manager *
+static inline struct pool_pb_manager *
 pool_pb_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -93,7 +93,7 @@ struct pool_buffer
 };
 
 
-static INLINE struct pool_buffer *
+static inline struct pool_buffer *
 pool_buffer(struct pb_buffer *buf)
 {
    assert(buf);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 6a62b4f5fdb..aadeaa087f4 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -163,7 +163,7 @@ struct pb_slab_range_manager
 };
 
 
-static INLINE struct pb_slab_buffer *
+static inline struct pb_slab_buffer *
 pb_slab_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -171,7 +171,7 @@ pb_slab_buffer(struct pb_buffer *buf)
 }
 
 
-static INLINE struct pb_slab_manager *
+static inline struct pb_slab_manager *
 pb_slab_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -179,7 +179,7 @@ pb_slab_manager(struct pb_manager *mgr)
 }
 
 
-static INLINE struct pb_slab_range_manager *
+static inline struct pb_slab_range_manager *
 pb_slab_range_manager(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/auxiliary/postprocess/pp_colors.c b/src/gallium/auxiliary/postprocess/pp_colors.c
index 247e4df72a4..e6ea0102eac 100644
--- a/src/gallium/auxiliary/postprocess/pp_colors.c
+++ b/src/gallium/auxiliary/postprocess/pp_colors.c
@@ -37,6 +37,7 @@ pp_nocolor(struct pp_queue_t *ppq, struct pipe_resource *in,
 {
 
    struct pp_program *p = ppq->p;
+   const struct pipe_sampler_state *samplers[] = {&p->sampler_point};
 
    pp_filter_setup_in(p, in);
    pp_filter_setup_out(p, out);
@@ -44,8 +45,7 @@ pp_nocolor(struct pp_queue_t *ppq, struct pipe_resource *in,
    pp_filter_set_fb(p);
    pp_filter_misc_state(p);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->view);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][0]);
diff --git a/src/gallium/auxiliary/postprocess/pp_mlaa.c b/src/gallium/auxiliary/postprocess/pp_mlaa.c
index 147d14de95d..024a24895c8 100644
--- a/src/gallium/auxiliary/postprocess/pp_mlaa.c
+++ b/src/gallium/auxiliary/postprocess/pp_mlaa.c
@@ -141,8 +141,10 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    p->pipe->clear(p->pipe, PIPE_CLEAR_STENCIL | PIPE_CLEAR_COLOR0,
                   &p->clear_color, 0, 0);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] = {&p->sampler_point};
+      cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
+   }
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->view);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][1]);    /* offsetvs */
@@ -168,10 +170,11 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
 
    pp_filter_set_clear_fb(p);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->sampler_point);
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 2, &p->sampler);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] =
+         {&p->sampler_point, &p->sampler_point, &p->sampler};
+      cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 3, samplers);
+   }
 
    arr[0] = p->view;
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 3, arr);
@@ -199,9 +202,11 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    u_sampler_view_default_template(&v_tmp, in, in->format);
    arr[0] = p->pipe->create_sampler_view(p->pipe, in, &v_tmp);
 
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler_point);
-   cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->sampler_point);
-   cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] =
+         {&p->sampler_point, &p->sampler_point};
+      cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 2, samplers);
+   }
 
    arr[1] = p->view;
    cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 2, arr);
diff --git a/src/gallium/auxiliary/postprocess/pp_run.c b/src/gallium/auxiliary/postprocess/pp_run.c
index e76ce854442..caa2062f4cf 100644
--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -125,8 +125,8 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_save_rasterizer(cso);
    cso_save_sample_mask(cso);
    cso_save_min_samples(cso);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(cso);
+   cso_save_fragment_sampler_views(cso);
    cso_save_stencil_ref(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_elements(cso);
@@ -196,8 +196,8 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_restore_rasterizer(cso);
    cso_restore_sample_mask(cso);
    cso_restore_min_samples(cso);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(cso);
+   cso_restore_fragment_sampler_views(cso);
    cso_restore_stencil_ref(cso);
    cso_restore_stream_outputs(cso);
    cso_restore_vertex_elements(cso);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index f9637889187..27ee8f1242a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -510,7 +510,7 @@ void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm )
 /**
  * Immediate group 1 instructions.
  */
-static INLINE void 
+static inline void 
 x86_group1_imm( struct x86_function *p, 
                 unsigned op, struct x86_reg dst, int imm )
 {
@@ -2196,7 +2196,7 @@ void x86_release_func( struct x86_function *p )
 }
 
 
-static INLINE x86_func
+static inline x86_func
 voidptr_to_x86_func(void *v)
 {
    union {
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 498ca824cd1..b44d917cd43 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -136,7 +136,7 @@ enum x86_target
 };
 
 /* make this read a member of x86_function if target != host is desired */
-static INLINE enum x86_target x86_target( struct x86_function* p )
+static inline enum x86_target x86_target( struct x86_function* p )
 {
 #ifdef PIPE_ARCH_X86
    return X86_32;
@@ -147,7 +147,7 @@ static INLINE enum x86_target x86_target( struct x86_function* p )
 #endif
 }
 
-static INLINE unsigned x86_target_caps( struct x86_function* p )
+static inline unsigned x86_target_caps( struct x86_function* p )
 {
    return p->caps;
 }
diff --git a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
index 0648e596549..d353ab81e34 100644
--- a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
@@ -27,7 +27,7 @@
  * TODO: Audit the following *screen_create() - all of
  * them should return the original screen on failuire.
  */
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 debug_screen_wrap(struct pipe_screen *screen)
 {
 #if defined(GALLIUM_RBUG)
diff --git a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
index d3c331d224d..08271a760f5 100644
--- a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
@@ -42,6 +42,7 @@
 #if GALLIUM_RADEONSI
 #include "radeon/radeon_winsys.h"
 #include "radeon/drm/radeon_drm_public.h"
+#include "amdgpu/drm/amdgpu_public.h"
 #include "radeonsi/si_public.h"
 #endif
 
@@ -228,7 +229,12 @@ pipe_radeonsi_create_screen(int fd)
 {
    struct radeon_winsys *rw;
 
-   rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+   /* First, try amdgpu. */
+   rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
+
+   if (!rw)
+      rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
 #endif
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
index d8cee2b2917..5f46552f6c3 100644
--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -20,7 +20,7 @@
 #endif
 
 
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
 {
    struct pipe_screen *screen = NULL;
@@ -39,7 +39,7 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
 }
 
 
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 sw_screen_create(struct sw_winsys *winsys)
 {
    const char *default_driver;
@@ -71,7 +71,7 @@ PUBLIC const __DRIextension **__driDriverGetExtensions_swrast(void)
    return galliumsw_driver_extensions;
 }
 
-INLINE struct pipe_screen *
+inline struct pipe_screen *
 drisw_create_screen(struct drisw_loader_funcs *lf)
 {
    struct sw_winsys *winsys = NULL;
@@ -98,7 +98,7 @@ drisw_create_screen(struct drisw_loader_funcs *lf)
 
 extern struct pipe_screen *ninesw_create_screen(struct pipe_screen *screen);
 
-INLINE struct pipe_screen *
+inline struct pipe_screen *
 ninesw_create_screen(struct pipe_screen *pscreen)
 {
    struct sw_winsys *winsys = NULL;
diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
index 0a2e215352b..4f38ba9f919 100644
--- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
@@ -9,7 +9,7 @@
  * Try to wrap a hw screen with a software screen.
  * On failure will return given screen.
  */
-static INLINE struct pipe_screen *
+static inline struct pipe_screen *
 sw_screen_wrap(struct pipe_screen *screen)
 {
 #if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index c80d7a20481..8ceb5b47584 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -48,6 +48,7 @@ struct dump_ctx
    int indent;
    
    uint indentation;
+   FILE *file;
 
    void (*dump_printf)(struct dump_ctx *ctx, const char *format, ...);
 };
@@ -58,7 +59,10 @@ dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...)
    va_list ap;
    (void)ctx;
    va_start(ap, format);
-   _debug_vprintf(format, ap);
+   if (ctx->file)
+      vfprintf(ctx->file, format, ap);
+   else
+      _debug_vprintf(format, ap);
    va_end(ap);
 }
 
@@ -659,9 +663,7 @@ prolog(
 }
 
 void
-tgsi_dump(
-   const struct tgsi_token *tokens,
-   uint flags )
+tgsi_dump_to_file(const struct tgsi_token *tokens, uint flags, FILE *file)
 {
    struct dump_ctx ctx;
 
@@ -677,10 +679,17 @@ tgsi_dump(
    ctx.indent = 0;
    ctx.dump_printf = dump_ctx_printf;
    ctx.indentation = 0;
+   ctx.file = file;
 
    tgsi_iterate_shader( tokens, &ctx.iter );
 }
 
+void
+tgsi_dump(const struct tgsi_token *tokens, uint flags)
+{
+   tgsi_dump_to_file(tokens, flags, NULL);
+}
+
 struct str_dump_ctx
 {
    struct dump_ctx base;
@@ -733,6 +742,7 @@ tgsi_dump_str(
    ctx.base.indent = 0;
    ctx.base.dump_printf = &str_dump_ctx_printf;
    ctx.base.indentation = 0;
+   ctx.base.file = NULL;
 
    ctx.str = str;
    ctx.str[0] = 0;
@@ -756,6 +766,7 @@ tgsi_dump_instruction_str(
    ctx.base.indent = 0;
    ctx.base.dump_printf = &str_dump_ctx_printf;
    ctx.base.indentation = 0;
+   ctx.base.file = NULL;
 
    ctx.str = str;
    ctx.str[0] = 0;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index bc873a54ae9..7c8f92ee7bc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -32,6 +32,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
+#include <stdio.h>
+
 #if defined __cplusplus
 extern "C" {
 #endif
@@ -43,6 +45,9 @@ tgsi_dump_str(
    char *str,
    size_t size);
 
+void
+tgsi_dump_to_file(const struct tgsi_token *tokens, uint flags, FILE *file);
+
 void
 tgsi_dump(
    const struct tgsi_token *tokens,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 44000ffdb6c..75cd0d53c5a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -735,7 +735,7 @@ static const union tgsi_exec_channel M128Vec = {
  * not lead to crashes, etc.  But when debugging, it's helpful to catch
  * them.
  */
-static INLINE void
+static inline void
 check_inf_or_nan(const union tgsi_exec_channel *chan)
 {
    assert(!util_is_inf_or_nan((chan)->f[0]));
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 208640cfd46..5d56aab2216 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -213,7 +213,7 @@ struct tgsi_sampler
  * input register files, this is the stride between two 1D
  * arrays.
  */
-#define TGSI_EXEC_MAX_INPUT_ATTRIBS PIPE_MAX_SHADER_INPUTS
+#define TGSI_EXEC_MAX_INPUT_ATTRIBS 32
 
 /* The maximum number of bytes per constant buffer.
  */
@@ -386,7 +386,7 @@ boolean
 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst);
 
 
-static INLINE void
+static inline void
 tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
 {
    mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
@@ -395,7 +395,7 @@ tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
 
 
 /** Set execution mask values prior to executing the shader */
-static INLINE void
+static inline void
 tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
                    boolean ch0, boolean ch1, boolean ch2, boolean ch3)
 {
@@ -414,7 +414,7 @@ tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
                                const unsigned *buf_sizes);
 
 
-static INLINE int
+static inline int
 tgsi_exec_get_shader_param(enum pipe_shader_cap param)
 {
    switch(param) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 929531109e5..fb29ea0d53d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -316,7 +316,7 @@ tgsi_get_processor_name( uint processor )
  *
  * MOV and UCMP is special so return VOID
  */
-static INLINE enum tgsi_opcode_type
+static inline enum tgsi_opcode_type
 tgsi_opcode_infer_type( uint opcode )
 {
    switch (opcode) {
@@ -374,7 +374,34 @@ tgsi_opcode_infer_type( uint opcode )
    case TGSI_OPCODE_IMUL_HI:
    case TGSI_OPCODE_IBFE:
    case TGSI_OPCODE_IMSB:
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSGE:
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSNE:
       return TGSI_TYPE_SIGNED;
+   case TGSI_OPCODE_DADD:
+   case TGSI_OPCODE_DABS:
+   case TGSI_OPCODE_DFMA:
+   case TGSI_OPCODE_DNEG:
+   case TGSI_OPCODE_DMUL:
+   case TGSI_OPCODE_DMAX:
+   case TGSI_OPCODE_DMIN:
+   case TGSI_OPCODE_DRCP:
+   case TGSI_OPCODE_DSQRT:
+   case TGSI_OPCODE_DMAD:
+   case TGSI_OPCODE_DLDEXP:
+   case TGSI_OPCODE_DFRACEXP:
+   case TGSI_OPCODE_DFRAC:
+   case TGSI_OPCODE_DRSQ:
+   case TGSI_OPCODE_DTRUNC:
+   case TGSI_OPCODE_DCEIL:
+   case TGSI_OPCODE_DFLR:
+   case TGSI_OPCODE_DROUND:
+   case TGSI_OPCODE_DSSG:
+   case TGSI_OPCODE_F2D:
+   case TGSI_OPCODE_I2D:
+   case TGSI_OPCODE_U2D:
+      return TGSI_TYPE_DOUBLE;
    default:
       return TGSI_TYPE_FLOAT;
    }
@@ -391,6 +418,7 @@ tgsi_opcode_infer_src_type( uint opcode )
    case TGSI_OPCODE_TXF:
    case TGSI_OPCODE_BREAKC:
    case TGSI_OPCODE_U2F:
+   case TGSI_OPCODE_U2D:
    case TGSI_OPCODE_UADD:
    case TGSI_OPCODE_SWITCH:
    case TGSI_OPCODE_CASE:
@@ -400,10 +428,12 @@ tgsi_opcode_infer_src_type( uint opcode )
       return TGSI_TYPE_UNSIGNED;
    case TGSI_OPCODE_IMUL_HI:
    case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_I2D:
       return TGSI_TYPE_SIGNED;
    case TGSI_OPCODE_ARL:
    case TGSI_OPCODE_ARR:
    case TGSI_OPCODE_TXQ_LZ:
+   case TGSI_OPCODE_F2D:
    case TGSI_OPCODE_F2I:
    case TGSI_OPCODE_F2U:
    case TGSI_OPCODE_FSEQ:
@@ -412,6 +442,14 @@ tgsi_opcode_infer_src_type( uint opcode )
    case TGSI_OPCODE_FSNE:
    case TGSI_OPCODE_UCMP:
       return TGSI_TYPE_FLOAT;
+   case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_D2U:
+   case TGSI_OPCODE_D2I:
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSGE:
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSNE:
+      return TGSI_TYPE_DOUBLE;
    default:
       return tgsi_opcode_infer_type(opcode);
    }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 1162b265522..0729b5d2426 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -36,7 +36,7 @@ tgsi_parse_init(
    const struct tgsi_token *tokens )
 {
    ctx->FullHeader.Header = *(struct tgsi_header *) &tokens[0];
-   if( ctx->FullHeader.Header.HeaderSize >= 2 ) {
+   if (ctx->FullHeader.Header.HeaderSize >= 2) {
       ctx->FullHeader.Processor = *(struct tgsi_processor *) &tokens[1];
    }
    else {
@@ -69,7 +69,7 @@ tgsi_parse_end_of_tokens(
  * warnings.  The warnings seem harmless on x86 but on PPC they cause
  * real failures.
  */
-static INLINE void
+static inline void
 copy_token(void *dst, const void *src)
 {
    memcpy(dst, src, 4);
@@ -113,11 +113,11 @@ tgsi_parse_token(
          next_token(ctx, &decl->Dim);
       }
 
-      if( decl->Declaration.Interpolate ) {
+      if (decl->Declaration.Interpolate) {
          next_token( ctx, &decl->Interp );
       }
 
-      if( decl->Declaration.Semantic ) {
+      if (decl->Declaration.Semantic) {
          next_token( ctx, &decl->Semantic );
       }
 
@@ -129,7 +129,7 @@ tgsi_parse_token(
          next_token(ctx, &decl->SamplerView);
       }
 
-      if( decl->Declaration.Array ) {
+      if (decl->Declaration.Array) {
          next_token(ctx, &decl->Array);
       }
 
@@ -190,21 +190,21 @@ tgsi_parse_token(
 
       if (inst->Instruction.Texture) {
          next_token( ctx, &inst->Texture);
-         for( i = 0; i < inst->Texture.NumOffsets; i++ ) {
+         for (i = 0; i < inst->Texture.NumOffsets; i++) {
             next_token( ctx, &inst->TexOffsets[i] );
          }
       }
 
       assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
 
-      for(  i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
+      for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
 
          next_token( ctx, &inst->Dst[i].Register );
 
-         if( inst->Dst[i].Register.Indirect )
+         if (inst->Dst[i].Register.Indirect)
             next_token( ctx, &inst->Dst[i].Indirect );
 
-         if( inst->Dst[i].Register.Dimension ) {
+         if (inst->Dst[i].Register.Dimension) {
             next_token( ctx, &inst->Dst[i].Dimension );
 
             /*
@@ -212,21 +212,21 @@ tgsi_parse_token(
              */
             assert( !inst->Dst[i].Dimension.Dimension );
 
-            if( inst->Dst[i].Dimension.Indirect )
+            if (inst->Dst[i].Dimension.Indirect)
                next_token( ctx, &inst->Dst[i].DimIndirect );
          }
       }
 
       assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS );
 
-      for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
+      for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 
          next_token( ctx, &inst->Src[i].Register );
 
-         if( inst->Src[i].Register.Indirect )
+         if (inst->Src[i].Register.Indirect)
             next_token( ctx, &inst->Src[i].Indirect );
 
-         if( inst->Src[i].Register.Dimension ) {
+         if (inst->Src[i].Register.Dimension) {
             next_token( ctx, &inst->Src[i].Dimension );
 
             /*
@@ -234,7 +234,7 @@ tgsi_parse_token(
              */
             assert( !inst->Src[i].Dimension.Dimension );
 
-            if( inst->Src[i].Dimension.Indirect )
+            if (inst->Src[i].Dimension.Indirect)
                next_token( ctx, &inst->Src[i].DimIndirect );
          }
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index cd4b2afdb8b..35e1c7cfd62 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -133,7 +133,7 @@ void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx );
 
-static INLINE unsigned
+static inline unsigned
 tgsi_num_tokens(const struct tgsi_token *tokens)
 {
    struct tgsi_header header;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index be4851f5dcb..d14372feb30 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -63,7 +63,7 @@ struct sanity_check_ctx
    boolean print;
 };
 
-static INLINE unsigned
+static inline unsigned
 scan_register_key(const scan_register *reg)
 {
    unsigned key = reg->file;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 6b6a14f55f5..8271ea08177 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -203,7 +203,7 @@ const char *tgsi_immediate_type_names[4] =
 };
 
 
-static INLINE void
+static inline void
 tgsi_strings_check(void)
 {
    STATIC_ASSERT(Elements(tgsi_semantic_names) == TGSI_SEMANTIC_COUNT);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 39d7688ab3b..ceb7c2e0f46 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -94,7 +94,7 @@ struct tgsi_transform_context
 /**
  * Helper for emitting temporary register declarations.
  */
-static INLINE void
+static inline void
 tgsi_transform_temp_decl(struct tgsi_transform_context *ctx,
                          unsigned index)
 {
@@ -108,7 +108,7 @@ tgsi_transform_temp_decl(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_input_decl(struct tgsi_transform_context *ctx,
                           unsigned index,
                           unsigned sem_name, unsigned sem_index,
@@ -130,7 +130,7 @@ tgsi_transform_input_decl(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_sampler_decl(struct tgsi_transform_context *ctx,
                             unsigned index)
 {
@@ -143,7 +143,7 @@ tgsi_transform_sampler_decl(struct tgsi_transform_context *ctx,
    ctx->emit_declaration(ctx, &decl);
 }
 
-static INLINE void
+static inline void
 tgsi_transform_sampler_view_decl(struct tgsi_transform_context *ctx,
                                  unsigned index,
                                  unsigned target,
@@ -165,7 +165,7 @@ tgsi_transform_sampler_view_decl(struct tgsi_transform_context *ctx,
    ctx->emit_declaration(ctx, &decl);
 }
 
-static INLINE void
+static inline void
 tgsi_transform_immediate_decl(struct tgsi_transform_context *ctx,
                               float x, float y, float z, float w)
 {
@@ -186,7 +186,7 @@ tgsi_transform_immediate_decl(struct tgsi_transform_context *ctx,
 /**
  * Helper for emitting 1-operand instructions.
  */
-static INLINE void
+static inline void
 tgsi_transform_op1_inst(struct tgsi_transform_context *ctx,
                         unsigned opcode,
                         unsigned dst_file,
@@ -211,7 +211,7 @@ tgsi_transform_op1_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
                         unsigned opcode,
                         unsigned dst_file,
@@ -240,7 +240,7 @@ tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,
                             unsigned dst_file,
@@ -282,7 +282,7 @@ tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op2_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,
                             unsigned dst_file,
@@ -333,7 +333,7 @@ tgsi_transform_op2_swz_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_op3_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,
                             unsigned dst_file,
@@ -395,7 +395,7 @@ tgsi_transform_op3_swz_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,
                          unsigned src_file,
                          unsigned src_index,
@@ -419,7 +419,7 @@ tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,
 }
 
 
-static INLINE void
+static inline void
 tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
                            unsigned dst_file,
                            unsigned dst_index,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 201a849ef95..3d213195090 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -1830,7 +1830,7 @@ void ureg_free_tokens( const struct tgsi_token *tokens )
 }
 
 
-static INLINE unsigned
+static inline unsigned
 pipe_shader_from_tgsi_processor(unsigned processor)
 {
    switch (processor) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 1891b068774..0aae550d60a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -140,7 +140,7 @@ ureg_destroy( struct ureg_program * );
 /***********************************************************************
  * Convenience routine:
  */
-static INLINE void *
+static inline void *
 ureg_create_shader_with_so_and_destroy( struct ureg_program *p,
 			struct pipe_context *pipe,
 			const struct pipe_stream_output_info *so )
@@ -150,7 +150,7 @@ ureg_create_shader_with_so_and_destroy( struct ureg_program *p,
    return result;
 }
 
-static INLINE void *
+static inline void *
 ureg_create_shader_and_destroy( struct ureg_program *p,
                                 struct pipe_context *pipe )
 {
@@ -180,7 +180,7 @@ ureg_DECL_fs_input_cyl_centroid(struct ureg_program *,
                        unsigned array_id,
                        unsigned array_size);
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
                        unsigned semantic_name,
                        unsigned semantic_index,
@@ -195,7 +195,7 @@ ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
                                  0, 0, 1);
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_DECL_fs_input(struct ureg_program *ureg,
                    unsigned semantic_name,
                    unsigned semantic_index,
@@ -328,7 +328,7 @@ ureg_DECL_sampler_view(struct ureg_program *,
                        unsigned return_type_w );
 
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
                        float a, float b,
                        float c, float d)
@@ -341,7 +341,7 @@ ureg_imm4f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 4 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm3f( struct ureg_program *ureg,
                        float a, float b,
                        float c)
@@ -353,7 +353,7 @@ ureg_imm3f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 3 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm2f( struct ureg_program *ureg,
                        float a, float b)
 {
@@ -363,7 +363,7 @@ ureg_imm2f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 2 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm1f( struct ureg_program *ureg,
                        float a)
 {
@@ -372,7 +372,7 @@ ureg_imm1f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 1 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm4u( struct ureg_program *ureg,
             unsigned a, unsigned b,
             unsigned c, unsigned d)
@@ -385,7 +385,7 @@ ureg_imm4u( struct ureg_program *ureg,
    return ureg_DECL_immediate_uint( ureg, v, 4 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm3u( struct ureg_program *ureg,
             unsigned a, unsigned b,
             unsigned c)
@@ -397,7 +397,7 @@ ureg_imm3u( struct ureg_program *ureg,
    return ureg_DECL_immediate_uint( ureg, v, 3 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm2u( struct ureg_program *ureg,
             unsigned a, unsigned b)
 {
@@ -407,14 +407,14 @@ ureg_imm2u( struct ureg_program *ureg,
    return ureg_DECL_immediate_uint( ureg, v, 2 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm1u( struct ureg_program *ureg,
             unsigned a)
 {
    return ureg_DECL_immediate_uint( ureg, &a, 1 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm4i( struct ureg_program *ureg,
             int a, int b,
             int c, int d)
@@ -427,7 +427,7 @@ ureg_imm4i( struct ureg_program *ureg,
    return ureg_DECL_immediate_int( ureg, v, 4 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm3i( struct ureg_program *ureg,
             int a, int b,
             int c)
@@ -439,7 +439,7 @@ ureg_imm3i( struct ureg_program *ureg,
    return ureg_DECL_immediate_int( ureg, v, 3 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm2i( struct ureg_program *ureg,
             int a, int b)
 {
@@ -449,7 +449,7 @@ ureg_imm2i( struct ureg_program *ureg,
    return ureg_DECL_immediate_int( ureg, v, 2 );
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_imm1i( struct ureg_program *ureg,
             int a)
 {
@@ -459,7 +459,7 @@ ureg_imm1i( struct ureg_program *ureg,
 /* Where the destination register has a valid file, but an empty
  * writemask.
  */
-static INLINE boolean
+static inline boolean
 ureg_dst_is_empty( struct ureg_dst dst )
 {
    return dst.File != TGSI_FILE_NULL &&
@@ -573,7 +573,7 @@ ureg_fixup_insn_size(struct ureg_program *ureg,
 
 
 #define OP00( op )                                              \
-static INLINE void ureg_##op( struct ureg_program *ureg )       \
+static inline void ureg_##op( struct ureg_program *ureg )       \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
    struct ureg_emit_insn_result insn;                           \
@@ -592,7 +592,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg )       \
 }
 
 #define OP01( op )                                              \
-static INLINE void ureg_##op( struct ureg_program *ureg,        \
+static inline void ureg_##op( struct ureg_program *ureg,        \
                               struct ureg_src src )             \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
@@ -613,7 +613,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
 }
 
 #define OP00_LBL( op )                                          \
-static INLINE void ureg_##op( struct ureg_program *ureg,        \
+static inline void ureg_##op( struct ureg_program *ureg,        \
                               unsigned *label_token )           \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
@@ -634,7 +634,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
 }
 
 #define OP01_LBL( op )                                          \
-static INLINE void ureg_##op( struct ureg_program *ureg,        \
+static inline void ureg_##op( struct ureg_program *ureg,        \
                               struct ureg_src src,              \
                               unsigned *label_token )          \
 {                                                               \
@@ -657,7 +657,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
 }
 
 #define OP10( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst )                     \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
@@ -681,7 +681,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 
 
 #define OP11( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src )                     \
 {                                                                       \
@@ -706,7 +706,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP12( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1 )                    \
@@ -733,7 +733,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP12_TEX( op )                                                  \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               unsigned target,                          \
                               struct ureg_src src0,                     \
@@ -762,7 +762,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP12_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1 )                    \
@@ -791,7 +791,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP13( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -820,7 +820,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP13_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -851,7 +851,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP14_TEX( op )                                                  \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               unsigned target,                          \
                               struct ureg_src src0,                     \
@@ -884,7 +884,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP14_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -918,7 +918,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 
 
 #define OP14( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -950,7 +950,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 
 
 #define OP15( op )                                                      \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -983,7 +983,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 }
 
 #define OP15_SAMPLE( op )                                               \
-static INLINE void ureg_##op( struct ureg_program *ureg,                \
+static inline void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst,                      \
                               struct ureg_src src0,                     \
                               struct ureg_src src1,                     \
@@ -1026,7 +1026,7 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
 /***********************************************************************
  * Inline helpers for manipulating register structs:
  */
-static INLINE struct ureg_src 
+static inline struct ureg_src 
 ureg_negate( struct ureg_src reg )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1034,7 +1034,7 @@ ureg_negate( struct ureg_src reg )
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_abs( struct ureg_src reg )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1043,7 +1043,7 @@ ureg_abs( struct ureg_src reg )
    return reg;
 }
 
-static INLINE struct ureg_src 
+static inline struct ureg_src 
 ureg_swizzle( struct ureg_src reg, 
               int x, int y, int z, int w )
 {
@@ -1065,13 +1065,13 @@ ureg_swizzle( struct ureg_src reg,
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_scalar( struct ureg_src reg, int x )
 {
    return ureg_swizzle(reg, x, x, x, x);
 }
 
-static INLINE struct ureg_dst 
+static inline struct ureg_dst 
 ureg_writemask( struct ureg_dst reg,
                 unsigned writemask )
 {
@@ -1080,7 +1080,7 @@ ureg_writemask( struct ureg_dst reg,
    return reg;
 }
 
-static INLINE struct ureg_dst 
+static inline struct ureg_dst 
 ureg_saturate( struct ureg_dst reg )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1088,7 +1088,7 @@ ureg_saturate( struct ureg_dst reg )
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_predicate(struct ureg_dst reg,
                boolean negate,
                unsigned swizzle_x,
@@ -1106,7 +1106,7 @@ ureg_predicate(struct ureg_dst reg,
    return reg;
 }
 
-static INLINE struct ureg_dst 
+static inline struct ureg_dst 
 ureg_dst_indirect( struct ureg_dst reg, struct ureg_src addr )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1118,7 +1118,7 @@ ureg_dst_indirect( struct ureg_dst reg, struct ureg_src addr )
    return reg;
 }
 
-static INLINE struct ureg_src 
+static inline struct ureg_src 
 ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1130,7 +1130,7 @@ ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_dimension( struct ureg_dst reg, int index )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1140,7 +1140,7 @@ ureg_dst_dimension( struct ureg_dst reg, int index )
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_dimension( struct ureg_src reg, int index )
 {
    assert(reg.File != TGSI_FILE_NULL);
@@ -1150,7 +1150,7 @@ ureg_src_dimension( struct ureg_src reg, int index )
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_dimension_indirect( struct ureg_dst reg, struct ureg_src addr,
                              int index )
 {
@@ -1164,7 +1164,7 @@ ureg_dst_dimension_indirect( struct ureg_dst reg, struct ureg_src addr,
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
                              int index )
 {
@@ -1178,21 +1178,21 @@ ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
    return reg;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_array_offset(struct ureg_src reg, int offset)
 {
    reg.Index += offset;
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_array_offset( struct ureg_dst reg, int offset )
 {
    reg.Index += offset;
    return reg;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_array_register(unsigned file,
                         unsigned index,
                         unsigned array_id)
@@ -1224,14 +1224,14 @@ ureg_dst_array_register(unsigned file,
    return dst;
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_register(unsigned file,
                   unsigned index)
 {
    return ureg_dst_array_register(file, index, 0);
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst( struct ureg_src src )
 {
    struct ureg_dst dst;
@@ -1265,7 +1265,7 @@ ureg_dst( struct ureg_src src )
    return dst;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_array_register(unsigned file,
                         unsigned index,
                         unsigned array_id)
@@ -1295,14 +1295,14 @@ ureg_src_array_register(unsigned file,
    return src;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_register(unsigned file,
                   unsigned index)
 {
    return ureg_src_array_register(file, index, 0);
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src( struct ureg_dst dst )
 {
    struct ureg_src src;
@@ -1332,7 +1332,7 @@ ureg_src( struct ureg_dst dst )
 
 
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 ureg_dst_undef( void )
 {
    struct ureg_dst dst;
@@ -1362,7 +1362,7 @@ ureg_dst_undef( void )
    return dst;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 ureg_src_undef( void )
 {
    struct ureg_src src;
@@ -1390,13 +1390,13 @@ ureg_src_undef( void )
    return src;
 }
 
-static INLINE boolean
+static inline boolean
 ureg_src_is_undef( struct ureg_src src )
 {
    return src.File == TGSI_FILE_NULL;
 }
 
-static INLINE boolean
+static inline boolean
 ureg_dst_is_undef( struct ureg_dst dst )
 {
    return dst.File == TGSI_FILE_NULL;
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index 7fe8ff8145f..d77561aa7ce 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -130,12 +130,12 @@ struct translate *translate_create( const struct translate_key *key );
 
 boolean translate_is_output_format_supported(enum pipe_format format);
 
-static INLINE int translate_keysize( const struct translate_key *key )
+static inline int translate_keysize( const struct translate_key *key )
 {
    return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element);
 }
 
-static INLINE int translate_key_compare( const struct translate_key *a,
+static inline int translate_key_compare( const struct translate_key *a,
                                          const struct translate_key *b )
 {
    int keysize_a = translate_keysize(a);
@@ -148,7 +148,7 @@ static INLINE int translate_key_compare( const struct translate_key *a,
 }
 
 
-static INLINE void translate_key_sanitize( struct translate_key *a )
+static inline void translate_key_sanitize( struct translate_key *a )
 {
    int keysize = translate_keysize(a);
    char *ptr = (char *)a;
diff --git a/src/gallium/auxiliary/translate/translate_cache.c b/src/gallium/auxiliary/translate/translate_cache.c
index bb8bdcb58c4..2bed02a454b 100644
--- a/src/gallium/auxiliary/translate/translate_cache.c
+++ b/src/gallium/auxiliary/translate/translate_cache.c
@@ -49,7 +49,7 @@ struct translate_cache * translate_cache_create( void )
 }
 
 
-static INLINE void delete_translates(struct translate_cache *cache)
+static inline void delete_translates(struct translate_cache *cache)
 {
    struct cso_hash *hash = cache->hash;
    struct cso_hash_iter iter = cso_hash_first_node(hash);
@@ -70,14 +70,14 @@ void translate_cache_destroy(struct translate_cache *cache)
 }
 
 
-static INLINE unsigned translate_hash_key_size(struct translate_key *key)
+static inline unsigned translate_hash_key_size(struct translate_key *key)
 {
    unsigned size = sizeof(struct translate_key) -
                    sizeof(struct translate_element) * (TRANSLATE_MAX_ATTRIBS - key->nr_elements);
    return size;
 }
 
-static INLINE unsigned create_key(struct translate_key *key)
+static inline unsigned create_key(struct translate_key *key)
 {
    unsigned hash_key;
    unsigned size = translate_hash_key_size(key);
diff --git a/src/gallium/auxiliary/util/u_bitmask.c b/src/gallium/auxiliary/util/u_bitmask.c
index 23c93a3ebcb..b19be29a5a4 100644
--- a/src/gallium/auxiliary/util/u_bitmask.c
+++ b/src/gallium/auxiliary/util/u_bitmask.c
@@ -85,7 +85,7 @@ util_bitmask_create(void)
 /**
  * Resize the bitmask if necessary 
  */
-static INLINE boolean
+static inline boolean
 util_bitmask_resize(struct util_bitmask *bm,
                     unsigned minimum_index)
 {
@@ -131,7 +131,7 @@ util_bitmask_resize(struct util_bitmask *bm,
 /**
  * Lazily update the filled.
  */
-static INLINE void
+static inline void
 util_bitmask_filled_set(struct util_bitmask *bm,
                         unsigned index)
 {
@@ -144,7 +144,7 @@ util_bitmask_filled_set(struct util_bitmask *bm,
    }
 }
 
-static INLINE void
+static inline void
 util_bitmask_filled_unset(struct util_bitmask *bm,
                           unsigned index)
 {
diff --git a/src/gallium/auxiliary/util/u_blend.h b/src/gallium/auxiliary/util/u_blend.h
index 2485c34d418..4f969778972 100644
--- a/src/gallium/auxiliary/util/u_blend.h
+++ b/src/gallium/auxiliary/util/u_blend.h
@@ -9,7 +9,7 @@
  * garbage that's there. Return a blend factor that will take that into
  * account.
  */
-static INLINE int
+static inline int
 util_blend_dst_alpha_to_one(int factor)
 {
    switch (factor) {
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index e3f30557a03..9737c940936 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -158,7 +158,7 @@ util_destroy_blit(struct blit_state *ctx)
 /**
  * Helper function to set the fragment shaders.
  */
-static INLINE void
+static inline void
 set_fragment_shader(struct blit_state *ctx, uint writemask,
                     enum pipe_format format,
                     enum pipe_texture_target pipe_tex)
@@ -194,7 +194,7 @@ set_fragment_shader(struct blit_state *ctx, uint writemask,
 /**
  * Helper function to set the vertex shader.
  */
-static INLINE void
+static inline void
 set_vertex_shader(struct blit_state *ctx)
 {
    /* vertex shader - still required to provide the linkage between
@@ -546,8 +546,8 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_save_rasterizer(ctx->cso);
    cso_save_sample_mask(ctx->cso);
    cso_save_min_samples(ctx->cso);
-   cso_save_samplers(ctx->cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(ctx->cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(ctx->cso);
+   cso_save_fragment_sampler_views(ctx->cso);
    cso_save_stream_outputs(ctx->cso);
    cso_save_viewport(ctx->cso);
    cso_save_framebuffer(ctx->cso);
@@ -572,8 +572,10 @@ util_blit_pixels_tex(struct blit_state *ctx,
    ctx->sampler.normalized_coords = normalized;
    ctx->sampler.min_img_filter = filter;
    ctx->sampler.mag_img_filter = filter;
-   cso_single_sampler(ctx->cso, PIPE_SHADER_FRAGMENT, 0, &ctx->sampler);
-   cso_single_sampler_done(ctx->cso, PIPE_SHADER_FRAGMENT);
+   {
+      const struct pipe_sampler_state *samplers[] = {&ctx->sampler};
+      cso_set_samplers(ctx->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
+   }
 
    /* viewport */
    ctx->viewport.scale[0] = 0.5f * dst->width;
@@ -628,8 +630,8 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_restore_rasterizer(ctx->cso);
    cso_restore_sample_mask(ctx->cso);
    cso_restore_min_samples(ctx->cso);
-   cso_restore_samplers(ctx->cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(ctx->cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(ctx->cso);
+   cso_restore_fragment_sampler_views(ctx->cso);
    cso_restore_viewport(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index b5ef9a23966..85206eab1a7 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -938,7 +938,7 @@ static void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
    }
 }
 
-static INLINE
+static inline
 void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
                                     enum pipe_texture_target target,
                                     unsigned nr_samples)
@@ -976,7 +976,7 @@ void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
    }
 }
 
-static INLINE
+static inline
 void *blitter_get_fs_texfetch_depthstencil(struct blitter_context_priv *ctx,
                                            enum pipe_texture_target target,
                                            unsigned nr_samples)
@@ -1014,7 +1014,7 @@ void *blitter_get_fs_texfetch_depthstencil(struct blitter_context_priv *ctx,
    }
 }
 
-static INLINE
+static inline
 void *blitter_get_fs_texfetch_stencil(struct blitter_context_priv *ctx,
                                       enum pipe_texture_target target,
                                       unsigned nr_samples)
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index 93b0e513bd0..0cd173d6284 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -143,7 +143,7 @@ void util_blitter_cache_all_shaders(struct blitter_context *blitter);
 /**
  * Return the pipe context associated with a blitter context.
  */
-static INLINE
+static inline
 struct pipe_context *util_blitter_get_pipe(struct blitter_context *blitter)
 {
    return blitter->pipe;
@@ -371,77 +371,77 @@ void util_blitter_custom_resolve_color(struct blitter_context *blitter,
  *
  * States not listed here are not affected by util_blitter. */
 
-static INLINE
+static inline
 void util_blitter_save_blend(struct blitter_context *blitter,
                              void *state)
 {
    blitter->saved_blend_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_depth_stencil_alpha(struct blitter_context *blitter,
                                            void *state)
 {
    blitter->saved_dsa_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_vertex_elements(struct blitter_context *blitter,
                                        void *state)
 {
    blitter->saved_velem_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_stencil_ref(struct blitter_context *blitter,
                                    const struct pipe_stencil_ref *state)
 {
    blitter->saved_stencil_ref = *state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_rasterizer(struct blitter_context *blitter,
                                   void *state)
 {
    blitter->saved_rs_state = state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_fragment_shader(struct blitter_context *blitter,
                                        void *fs)
 {
    blitter->saved_fs = fs;
 }
 
-static INLINE
+static inline
 void util_blitter_save_vertex_shader(struct blitter_context *blitter,
                                      void *vs)
 {
    blitter->saved_vs = vs;
 }
 
-static INLINE
+static inline
 void util_blitter_save_geometry_shader(struct blitter_context *blitter,
                                        void *gs)
 {
    blitter->saved_gs = gs;
 }
 
-static INLINE void
+static inline void
 util_blitter_save_tessctrl_shader(struct blitter_context *blitter,
                                   void *sh)
 {
    blitter->saved_tcs = sh;
 }
 
-static INLINE void
+static inline void
 util_blitter_save_tesseval_shader(struct blitter_context *blitter,
                                   void *sh)
 {
    blitter->saved_tes = sh;
 }
 
-static INLINE
+static inline
 void util_blitter_save_framebuffer(struct blitter_context *blitter,
                                    const struct pipe_framebuffer_state *state)
 {
@@ -449,21 +449,21 @@ void util_blitter_save_framebuffer(struct blitter_context *blitter,
    util_copy_framebuffer_state(&blitter->saved_fb_state, state);
 }
 
-static INLINE
+static inline
 void util_blitter_save_viewport(struct blitter_context *blitter,
                                 struct pipe_viewport_state *state)
 {
    blitter->saved_viewport = *state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_scissor(struct blitter_context *blitter,
                                struct pipe_scissor_state *state)
 {
    blitter->saved_scissor = *state;
 }
 
-static INLINE
+static inline
 void util_blitter_save_fragment_sampler_states(
                   struct blitter_context *blitter,
                   unsigned num_sampler_states,
@@ -476,7 +476,7 @@ void util_blitter_save_fragment_sampler_states(
           num_sampler_states * sizeof(void *));
 }
 
-static INLINE void
+static inline void
 util_blitter_save_fragment_sampler_views(struct blitter_context *blitter,
                                          unsigned num_views,
                                          struct pipe_sampler_view **views)
@@ -490,7 +490,7 @@ util_blitter_save_fragment_sampler_views(struct blitter_context *blitter,
                                   views[i]);
 }
 
-static INLINE void
+static inline void
 util_blitter_save_vertex_buffer_slot(struct blitter_context *blitter,
                                      struct pipe_vertex_buffer *vertex_buffers)
 {
@@ -500,7 +500,7 @@ util_blitter_save_vertex_buffer_slot(struct blitter_context *blitter,
           sizeof(struct pipe_vertex_buffer));
 }
 
-static INLINE void
+static inline void
 util_blitter_save_so_targets(struct blitter_context *blitter,
                              unsigned num_targets,
                              struct pipe_stream_output_target **targets)
@@ -514,7 +514,7 @@ util_blitter_save_so_targets(struct blitter_context *blitter,
                                targets[i]);
 }
 
-static INLINE void
+static inline void
 util_blitter_save_sample_mask(struct blitter_context *blitter,
                               unsigned sample_mask)
 {
@@ -522,7 +522,7 @@ util_blitter_save_sample_mask(struct blitter_context *blitter,
    blitter->saved_sample_mask = sample_mask;
 }
 
-static INLINE void
+static inline void
 util_blitter_save_render_condition(struct blitter_context *blitter,
                                    struct pipe_query *query,
                                    boolean condition,
diff --git a/src/gallium/auxiliary/util/u_box.h b/src/gallium/auxiliary/util/u_box.h
index 520a3d596cb..66cf989a830 100644
--- a/src/gallium/auxiliary/util/u_box.h
+++ b/src/gallium/auxiliary/util/u_box.h
@@ -4,7 +4,7 @@
 #include "pipe/p_state.h"
 #include "util/u_math.h"
 
-static INLINE
+static inline
 void u_box_1d( unsigned x,
 	       unsigned w,
 	       struct pipe_box *box )
@@ -17,7 +17,7 @@ void u_box_1d( unsigned x,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_2d( unsigned x,
 	       unsigned y,
 	       unsigned w,
@@ -32,7 +32,7 @@ void u_box_2d( unsigned x,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_origin_2d( unsigned w,
 		      unsigned h,
 		      struct pipe_box *box )
@@ -45,7 +45,7 @@ void u_box_origin_2d( unsigned w,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_2d_zslice( unsigned x,
 		      unsigned y,
 		      unsigned z,
@@ -61,7 +61,7 @@ void u_box_2d_zslice( unsigned x,
    box->depth = 1;
 }
 
-static INLINE
+static inline
 void u_box_3d( unsigned x,
 	       unsigned y,
 	       unsigned z,
@@ -86,7 +86,7 @@ void u_box_3d( unsigned x,
  *          3 if both width and height have been reduced.
  * Aliasing permitted.
  */
-static INLINE int
+static inline int
 u_box_clip_2d(struct pipe_box *dst,
               const struct pipe_box *box, int w, int h)
 {
@@ -129,14 +129,14 @@ u_box_clip_2d(struct pipe_box *dst,
    return res;
 }
 
-static INLINE int64_t
+static inline int64_t
 u_box_volume_3d(const struct pipe_box *box)
 {
    return (int64_t)box->width * box->height * box->depth;
 }
 
 /* Aliasing of @dst permitted. */
-static INLINE void
+static inline void
 u_box_union_2d(struct pipe_box *dst,
                const struct pipe_box *a, const struct pipe_box *b)
 {
@@ -148,7 +148,7 @@ u_box_union_2d(struct pipe_box *dst,
 }
 
 /* Aliasing of @dst permitted. */
-static INLINE void
+static inline void
 u_box_union_3d(struct pipe_box *dst,
                const struct pipe_box *a, const struct pipe_box *b)
 {
@@ -161,7 +161,7 @@ u_box_union_3d(struct pipe_box *dst,
    dst->depth = MAX2(a->z + a->depth, b->z + b->depth) - dst->z;
 }
 
-static INLINE boolean
+static inline boolean
 u_box_test_intersection_2d(const struct pipe_box *a,
                            const struct pipe_box *b)
 {
@@ -185,7 +185,7 @@ u_box_test_intersection_2d(const struct pipe_box *a,
    return TRUE;
 }
 
-static INLINE void
+static inline void
 u_box_minify_2d(struct pipe_box *dst,
                 const struct pipe_box *src, unsigned l)
 {
diff --git a/src/gallium/auxiliary/util/u_cache.c b/src/gallium/auxiliary/util/u_cache.c
index 9395c66f2f8..da0856981eb 100644
--- a/src/gallium/auxiliary/util/u_cache.c
+++ b/src/gallium/auxiliary/util/u_cache.c
@@ -155,7 +155,7 @@ util_cache_entry_get(struct util_cache *cache,
    return NULL;
 }
 
-static INLINE void
+static inline void
 util_cache_entry_destroy(struct util_cache *cache,
                          struct util_cache_entry *entry)
 {
diff --git a/src/gallium/auxiliary/util/u_clear.h b/src/gallium/auxiliary/util/u_clear.h
index af557be00bd..864d1302b4f 100644
--- a/src/gallium/auxiliary/util/u_clear.h
+++ b/src/gallium/auxiliary/util/u_clear.h
@@ -37,7 +37,7 @@
  * Clear the given buffers to the specified values.
  * No masking, no scissor (clear entire buffer).
  */
-static INLINE void
+static inline void
 util_clear(struct pipe_context *pipe,
            struct pipe_framebuffer_state *framebuffer, unsigned buffers,
            const union pipe_color_union *color, double depth, unsigned stencil)
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index 23ab46c54bc..d1f9e978682 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -179,7 +179,7 @@ static int has_cpuid(void)
  * @sa cpuid.h included in gcc-4.3 onwards.
  * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
  */
-static INLINE void
+static inline void
 cpuid(uint32_t ax, uint32_t *p)
 {
 #if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
@@ -216,7 +216,7 @@ cpuid(uint32_t ax, uint32_t *p)
  * @sa cpuid.h included in gcc-4.4 onwards.
  * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
  */
-static INLINE void
+static inline void
 cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
 {
 #if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
@@ -250,7 +250,7 @@ cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
 }
 
 
-static INLINE uint64_t xgetbv(void)
+static inline uint64_t xgetbv(void)
 {
 #if defined(PIPE_CC_GCC)
    uint32_t eax, edx;
@@ -272,7 +272,7 @@ static INLINE uint64_t xgetbv(void)
 
 
 #if defined(PIPE_ARCH_X86)
-PIPE_ALIGN_STACK static INLINE boolean sse2_has_daz(void)
+PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
 {
    struct {
       uint32_t pad1[7];
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 2d2d049b205..b4503deb8f6 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -256,12 +256,12 @@ static boolean str_has_option(const char *str, const char *name)
    return FALSE;
 }
 
-unsigned long
+uint64_t
 debug_get_flags_option(const char *name, 
                        const struct debug_named_value *flags,
-                       unsigned long dfault)
+                       uint64_t dfault)
 {
-   unsigned long result;
+   uint64_t result;
    const char *str;
    const struct debug_named_value *orig = flags;
    unsigned namealign = 0;
@@ -276,7 +276,7 @@ debug_get_flags_option(const char *name,
          namealign = MAX2(namealign, strlen(flags->name));
       for (flags = orig; flags->name; ++flags)
-         _debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
-                      (int)sizeof(unsigned long)*CHAR_BIT/4, flags->value,
+         _debug_printf("| %*s [0x%0*llx]%s%s\n", namealign, flags->name,
+                      (int)sizeof(uint64_t)*CHAR_BIT/4, (unsigned long long)flags->value,
                       flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
    else {
@@ -758,7 +758,8 @@ debug_print_bind_flags(const char *msg, unsigned usage)
       DEBUG_NAMED_VALUE(PIPE_BIND_CURSOR),
       DEBUG_NAMED_VALUE(PIPE_BIND_CUSTOM),
       DEBUG_NAMED_VALUE(PIPE_BIND_GLOBAL),
-      DEBUG_NAMED_VALUE(PIPE_BIND_SHADER_RESOURCE),
+      DEBUG_NAMED_VALUE(PIPE_BIND_SHADER_BUFFER),
+      DEBUG_NAMED_VALUE(PIPE_BIND_SHADER_IMAGE),
       DEBUG_NAMED_VALUE(PIPE_BIND_COMPUTE_RESOURCE),
       DEBUG_NAMED_VALUE(PIPE_BIND_COMMAND_ARGS_BUFFER),
       DEBUG_NAMED_VALUE(PIPE_BIND_SCANOUT),
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 3b2255244a7..926063a1918 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -58,7 +58,7 @@ extern "C" {
 void _debug_vprintf(const char *format, va_list ap);
    
 
-static INLINE void
+static inline void
 _debug_printf(const char *format, ...)
 {
    va_list ap;
@@ -78,10 +78,10 @@ _debug_printf(const char *format, ...)
  * that is guaranteed to be printed in all platforms)
  */
 #if !defined(PIPE_OS_HAIKU)
-static INLINE void
+static inline void
 debug_printf(const char *format, ...) _util_printf_format(1,2);
 
-static INLINE void
+static inline void
 debug_printf(const char *format, ...)
 {
 #ifdef DEBUG
@@ -269,7 +269,7 @@ void _debug_assert_fail(const char *expr,
 struct debug_named_value
 {
    const char *name;
-   unsigned long value;
+   uint64_t value;
    const char *desc;
 };
 
@@ -377,10 +377,10 @@ debug_get_bool_option(const char *name, boolean dfault);
 long
 debug_get_num_option(const char *name, long dfault);
 
-unsigned long
+uint64_t
 debug_get_flags_option(const char *name, 
                        const struct debug_named_value *flags,
-                       unsigned long dfault);
+                       uint64_t dfault);
 
 #define DEBUG_GET_ONCE_BOOL_OPTION(sufix, name, dfault) \
 static boolean \
diff --git a/src/gallium/auxiliary/util/u_debug_describe.c b/src/gallium/auxiliary/util/u_debug_describe.c
index df73ed83ef6..f428d22d205 100644
--- a/src/gallium/auxiliary/util/u_debug_describe.c
+++ b/src/gallium/auxiliary/util/u_debug_describe.c
@@ -80,6 +80,20 @@ debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr)
    util_sprintf(buf, "pipe_sampler_view<%s,%s>", res, util_format_short_name(ptr->format));
 }
 
+/**
+ * Format a description of an image view into buf as
+ * "pipe_image_view<RESOURCE,FORMAT>", where RESOURCE is the text written by
+ * debug_describe_resource() and FORMAT is the view format's short name.
+ */
+void
+debug_describe_image_view(char* buf, const struct pipe_image_view *ptr)
+{
+   char res[128];
+   debug_describe_resource(res, ptr->resource);
+   util_sprintf(buf, "pipe_image_view<%s,%s>", res,
+                util_format_short_name(ptr->format));
+}
+
 void
 debug_describe_so_target(char* buf,
                          const struct pipe_stream_output_target *ptr)
diff --git a/src/gallium/auxiliary/util/u_debug_describe.h b/src/gallium/auxiliary/util/u_debug_describe.h
index 4f7882b0b37..2172ecb4395 100644
--- a/src/gallium/auxiliary/util/u_debug_describe.h
+++ b/src/gallium/auxiliary/util/u_debug_describe.h
@@ -35,12 +35,14 @@ struct pipe_reference;
 struct pipe_resource;
 struct pipe_surface;
 struct pipe_sampler_view;
+struct pipe_image_view;
 
 /* a 256-byte buffer is necessary and sufficient */
 void debug_describe_reference(char* buf, const struct pipe_reference*ptr);
 void debug_describe_resource(char* buf, const struct pipe_resource *ptr);
 void debug_describe_surface(char* buf, const struct pipe_surface *ptr);
 void debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr);
+void debug_describe_image_view(char* buf, const struct pipe_image_view *ptr);
 void debug_describe_so_target(char* buf,
                               const struct pipe_stream_output_target *ptr);
 
diff --git a/src/gallium/auxiliary/util/u_debug_memory.c b/src/gallium/auxiliary/util/u_debug_memory.c
index 747837cd148..3e7ecfa79f3 100644
--- a/src/gallium/auxiliary/util/u_debug_memory.c
+++ b/src/gallium/auxiliary/util/u_debug_memory.c
@@ -92,7 +92,7 @@ pipe_static_mutex(list_mutex);
 static unsigned long last_no = 0;
 
 
-static INLINE struct debug_memory_header *
+static inline struct debug_memory_header *
 header_from_data(void *data)
 {
    if(data)
@@ -101,7 +101,7 @@ header_from_data(void *data)
       return NULL;
 }
 
-static INLINE void *
+static inline void *
 data_from_header(struct debug_memory_header *hdr)
 {
    if(hdr)
@@ -110,7 +110,7 @@ data_from_header(struct debug_memory_header *hdr)
       return NULL;
 }
 
-static INLINE struct debug_memory_footer *
+static inline struct debug_memory_footer *
 footer_from_header(struct debug_memory_header *hdr)
 {
    if(hdr)
diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h
index c02fba27ddf..1f9218fec9a 100644
--- a/src/gallium/auxiliary/util/u_debug_refcnt.h
+++ b/src/gallium/auxiliary/util/u_debug_refcnt.h
@@ -42,7 +42,7 @@ extern int debug_refcnt_state;
 
 void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change);
 
-static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
 {
    if (debug_refcnt_state >= 0)
       debug_reference_slowpath(p, get_desc, change);
@@ -50,7 +50,7 @@ static INLINE void debug_reference(const struct pipe_reference* p, debug_referen
 
 #else
 
-static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
 {
 }
 
diff --git a/src/gallium/auxiliary/util/u_debug_symbol.c b/src/gallium/auxiliary/util/u_debug_symbol.c
index 542493252ce..10efdd593e5 100644
--- a/src/gallium/auxiliary/util/u_debug_symbol.c
+++ b/src/gallium/auxiliary/util/u_debug_symbol.c
@@ -146,7 +146,7 @@ DBGHELP_DISPATCH(SymGetLineFromAddr64,
 #undef DBGHELP_DISPATCH
 
 
-static INLINE boolean
+static inline boolean
 debug_symbol_name_dbghelp(const void *addr, char* buf, unsigned size)
 {
    DWORD64 dwAddr = (DWORD64)(uintptr_t)addr;
@@ -227,7 +227,7 @@ debug_symbol_name_dbghelp(const void *addr, char* buf, unsigned size)
  *
  * To fix this, post-process the output with tools/addr2line.sh
  */
-static INLINE boolean
+static inline boolean
 debug_symbol_name_glibc(const void *addr, char* buf, unsigned size)
 {
    char** syms = backtrace_symbols((void**)&addr, 1);
diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h
index d31f8b9170a..ccde8a8c115 100644
--- a/src/gallium/auxiliary/util/u_dirty_surfaces.h
+++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h
@@ -47,13 +47,13 @@ struct util_dirty_surface
    struct list_head dirty_list;
 };
 
-static INLINE void
+static inline void
 util_dirty_surfaces_init(struct util_dirty_surfaces *ds)
 {
    LIST_INITHEAD(&ds->dirty_list);
 }
 
-static INLINE void
+static inline void
 util_dirty_surfaces_use_for_sampling(struct pipe_context *pipe, struct util_dirty_surfaces *dss, util_dirty_surface_flush_t flush)
 {
    struct list_head *p, *next;
@@ -66,7 +66,7 @@ util_dirty_surfaces_use_for_sampling(struct pipe_context *pipe, struct util_dirt
    }
 }
 
-static INLINE void
+static inline void
 util_dirty_surfaces_use_levels_for_sampling(struct pipe_context *pipe, struct util_dirty_surfaces *dss, unsigned first, unsigned last, util_dirty_surface_flush_t flush)
 {
    struct list_head *p, *next;
@@ -82,7 +82,7 @@ util_dirty_surfaces_use_levels_for_sampling(struct pipe_context *pipe, struct ut
    }
 }
 
-static INLINE void
+static inline void
 util_dirty_surfaces_use_for_sampling_with(struct pipe_context *pipe, struct util_dirty_surfaces *dss, struct pipe_sampler_view *psv, struct pipe_sampler_state *pss, util_dirty_surface_flush_t flush)
 {
    if(!LIST_IS_EMPTY(&dss->dirty_list))
@@ -90,26 +90,26 @@ util_dirty_surfaces_use_for_sampling_with(struct pipe_context *pipe, struct util
 						  MIN2((unsigned)ceilf(pss->max_lod) + psv->u.tex.first_level, psv->u.tex.last_level), flush);
 }
 
-static INLINE void
+static inline void
 util_dirty_surface_init(struct util_dirty_surface *ds)
 {
    LIST_INITHEAD(&ds->dirty_list);
 }
 
-static INLINE boolean
+static inline boolean
 util_dirty_surface_is_dirty(struct util_dirty_surface *ds)
 {
    return !LIST_IS_EMPTY(&ds->dirty_list);
 }
 
-static INLINE void
+static inline void
 util_dirty_surface_set_dirty(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds)
 {
    if(LIST_IS_EMPTY(&ds->dirty_list))
       LIST_ADDTAIL(&ds->dirty_list, &dss->dirty_list);
 }
 
-static INLINE void
+static inline void
 util_dirty_surface_set_clean(struct util_dirty_surfaces *dss, struct util_dirty_surface *ds)
 {
    if(!LIST_IS_EMPTY(&ds->dirty_list))
diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h
index 9fc3e9924e1..5c0880f6ce4 100644
--- a/src/gallium/auxiliary/util/u_draw.h
+++ b/src/gallium/auxiliary/util/u_draw.h
@@ -39,7 +39,7 @@ extern "C" {
 #endif
 
 
-static INLINE void
+static inline void
 util_draw_init_info(struct pipe_draw_info *info)
 {
    memset(info, 0, sizeof(*info));
@@ -48,7 +48,7 @@ util_draw_init_info(struct pipe_draw_info *info)
 }
 
 
-static INLINE void
+static inline void
 util_draw_arrays(struct pipe_context *pipe, uint mode, uint start, uint count)
 {
    struct pipe_draw_info info;
@@ -63,7 +63,7 @@ util_draw_arrays(struct pipe_context *pipe, uint mode, uint start, uint count)
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_elements(struct pipe_context *pipe, int index_bias,
                    uint mode, uint start, uint count)
 {
@@ -79,7 +79,7 @@ util_draw_elements(struct pipe_context *pipe, int index_bias,
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_arrays_instanced(struct pipe_context *pipe,
                            uint mode, uint start, uint count,
                            uint start_instance,
@@ -99,7 +99,7 @@ util_draw_arrays_instanced(struct pipe_context *pipe,
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_elements_instanced(struct pipe_context *pipe,
                              int index_bias,
                              uint mode, uint start, uint count,
@@ -120,7 +120,7 @@ util_draw_elements_instanced(struct pipe_context *pipe,
    pipe->draw_vbo(pipe, &info);
 }
 
-static INLINE void
+static inline void
 util_draw_range_elements(struct pipe_context *pipe,
                          int index_bias,
                          uint min_index,
diff --git a/src/gallium/auxiliary/util/u_dual_blend.h b/src/gallium/auxiliary/util/u_dual_blend.h
index e31d43c18bd..9450800f715 100644
--- a/src/gallium/auxiliary/util/u_dual_blend.h
+++ b/src/gallium/auxiliary/util/u_dual_blend.h
@@ -3,7 +3,7 @@
 
 #include "pipe/p_state.h"
 
-static INLINE boolean util_blend_factor_is_dual_src(int factor)
+static inline boolean util_blend_factor_is_dual_src(int factor)
 {
    return (factor == PIPE_BLENDFACTOR_SRC1_COLOR) ||
           (factor == PIPE_BLENDFACTOR_SRC1_ALPHA) ||
@@ -11,7 +11,7 @@ static INLINE boolean util_blend_factor_is_dual_src(int factor)
           (factor == PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
 }
 
-static INLINE boolean util_blend_state_is_dual(const struct pipe_blend_state *blend, 
+static inline boolean util_blend_state_is_dual(const struct pipe_blend_state *blend, 
 				  int index)
 {
    if (util_blend_factor_is_dual_src(blend->rt[index].rgb_src_factor) ||
diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h
index 58e7dfd8244..2598851152b 100644
--- a/src/gallium/auxiliary/util/u_dump.h
+++ b/src/gallium/auxiliary/util/u_dump.h
@@ -88,14 +88,16 @@ util_dump_tex_filter(unsigned value, boolean shortened);
 const char *
 util_dump_query_type(unsigned value, boolean shortened);
 
+const char *
+util_dump_prim_mode(unsigned value, boolean shortened);
+
 
 /*
  * p_state.h, through a FILE
  */
 
 void
-util_dump_template(FILE *stream,
-                   const struct pipe_resource *templat);
+util_dump_resource(FILE *stream, const struct pipe_resource *state);
 
 void
 util_dump_rasterizer_state(FILE *stream,
@@ -153,10 +155,23 @@ void
 util_dump_surface(FILE *stream,
                   const struct pipe_surface *state);
 
+void
+util_dump_image_view(FILE *stream, const struct pipe_image_view *state);
+
+void
+util_dump_sampler_view(FILE *stream, const struct pipe_sampler_view *state);
+
 void
 util_dump_transfer(FILE *stream,
                    const struct pipe_transfer *state);
 
+void
+util_dump_constant_buffer(FILE *stream,
+                          const struct pipe_constant_buffer *state);
+
+void
+util_dump_index_buffer(FILE *stream, const struct pipe_index_buffer *state);
+
 void
 util_dump_vertex_buffer(FILE *stream,
                         const struct pipe_vertex_buffer *state);
@@ -165,6 +180,10 @@ void
 util_dump_vertex_element(FILE *stream,
                          const struct pipe_vertex_element *state);
 
+void
+util_dump_stream_output_target(FILE *stream,
+                               const struct pipe_stream_output_target *state);
+
 void
 util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state);
 
diff --git a/src/gallium/auxiliary/util/u_dump_defines.c b/src/gallium/auxiliary/util/u_dump_defines.c
index 03fd15d0c44..3ddc9554b50 100644
--- a/src/gallium/auxiliary/util/u_dump_defines.c
+++ b/src/gallium/auxiliary/util/u_dump_defines.c
@@ -392,3 +392,44 @@ util_dump_query_type_short_names[] = {
 };
 
 DEFINE_UTIL_DUMP_CONTINUOUS(query_type)
+
+
+static const char *
+util_dump_prim_mode_names[] = {
+   "PIPE_PRIM_POINTS",
+   "PIPE_PRIM_LINES",
+   "PIPE_PRIM_LINE_LOOP",
+   "PIPE_PRIM_LINE_STRIP",
+   "PIPE_PRIM_TRIANGLES",
+   "PIPE_PRIM_TRIANGLE_STRIP",
+   "PIPE_PRIM_TRIANGLE_FAN",
+   "PIPE_PRIM_QUADS",
+   "PIPE_PRIM_QUAD_STRIP",
+   "PIPE_PRIM_POLYGON",
+   "PIPE_PRIM_LINES_ADJACENCY",
+   "PIPE_PRIM_LINE_STRIP_ADJACENCY",
+   "PIPE_PRIM_TRIANGLES_ADJACENCY",
+   "PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY",
+   "PIPE_PRIM_PATCHES",
+};
+
+static const char *
+util_dump_prim_mode_short_names[] = {
+   "points",
+   "lines",
+   "line_loop",
+   "line_strip",
+   "triangles",
+   "triangle_strip",
+   "triangle_fan",
+   "quads",
+   "quad_strip",
+   "polygon",
+   "lines_adjacency",
+   "line_strip_adjacency",
+   "triangles_adjacency",
+   "triangle_strip_adjacency",
+   "patches",
+};
+
+DEFINE_UTIL_DUMP_CONTINUOUS(prim_mode)
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index 7f620b50cf0..441d16236b5 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -39,7 +39,7 @@
  * Dump primitives
  */
 
-static INLINE void
+static inline void
 util_stream_writef(FILE *stream, const char *format, ...)
 {
    static char buf[1024];
@@ -247,6 +247,42 @@ util_dump_enum_func(FILE *stream, unsigned value)
    util_dump_enum(stream, util_dump_func(value, TRUE));
 }
 
+static void
+util_dump_enum_prim_mode(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_prim_mode(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_target(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_target(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_filter(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_filter(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_mipfilter(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_mipfilter(value, TRUE));
+}
+
+static void
+util_dump_enum_tex_wrap(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_tex_wrap(value, TRUE));
+}
+
+static void
+util_dump_enum_stencil_op(FILE *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_stencil_op(value, TRUE));
+}
+
 
 /*
  * Public functions
@@ -254,38 +290,28 @@ util_dump_enum_func(FILE *stream, unsigned value)
 
 
 void
-util_dump_template(FILE *stream, const struct pipe_resource *templat)
+util_dump_resource(FILE *stream, const struct pipe_resource *state)
 {
-   if(!templat) {
+   if (!state) {
       util_dump_null(stream);
       return;
    }
 
    util_dump_struct_begin(stream, "pipe_resource");
 
-   util_dump_member(stream, int, templat, target);
-   util_dump_member(stream, format, templat, format);
+   util_dump_member(stream, enum_tex_target, state, target);
+   util_dump_member(stream, format, state, format);
 
-   util_dump_member_begin(stream, "width");
-   util_dump_uint(stream, templat->width0);
-   util_dump_member_end(stream);
+   util_dump_member(stream, uint, state, width0);
+   util_dump_member(stream, uint, state, height0);
+   util_dump_member(stream, uint, state, depth0);
+   util_dump_member(stream, uint, state, array_size);
 
-   util_dump_member_begin(stream, "height");
-   util_dump_uint(stream, templat->height0);
-   util_dump_member_end(stream);
-
-   util_dump_member_begin(stream, "depth");
-   util_dump_uint(stream, templat->depth0);
-   util_dump_member_end(stream);
-
-   util_dump_member_begin(stream, "array_size");
-   util_dump_uint(stream, templat->array_size);
-   util_dump_member_end(stream);
-
-   util_dump_member(stream, uint, templat, last_level);
-   util_dump_member(stream, uint, templat, usage);
-   util_dump_member(stream, uint, templat, bind);
-   util_dump_member(stream, uint, templat, flags);
+   util_dump_member(stream, uint, state, last_level);
+   util_dump_member(stream, uint, state, nr_samples);
+   util_dump_member(stream, uint, state, usage);
+   util_dump_member(stream, uint, state, bind);
+   util_dump_member(stream, uint, state, flags);
 
    util_dump_struct_end(stream);
 }
@@ -319,6 +345,7 @@ util_dump_rasterizer_state(FILE *stream, const struct pipe_rasterizer_state *sta
    util_dump_member(stream, uint, state, sprite_coord_enable);
    util_dump_member(stream, bool, state, sprite_coord_mode);
    util_dump_member(stream, bool, state, point_quad_rasterization);
+   util_dump_member(stream, bool, state, point_tri_clip);
    util_dump_member(stream, bool, state, point_size_per_vertex);
    util_dump_member(stream, bool, state, multisample);
    util_dump_member(stream, bool, state, line_smooth);
@@ -331,6 +358,7 @@ util_dump_rasterizer_state(FILE *stream, const struct pipe_rasterizer_state *sta
    util_dump_member(stream, bool, state, bottom_edge_rule);
    util_dump_member(stream, bool, state, rasterizer_discard);
    util_dump_member(stream, bool, state, depth_clip);
+   util_dump_member(stream, bool, state, clip_halfz);
    util_dump_member(stream, uint, state, clip_plane_enable);
 
    util_dump_member(stream, float, state, line_width);
@@ -426,7 +454,6 @@ util_dump_clip_state(FILE *stream, const struct pipe_clip_state *state)
 void
 util_dump_shader_state(FILE *stream, const struct pipe_shader_state *state)
 {
-   char str[8192];
    unsigned i;
 
    if(!state) {
@@ -434,33 +461,35 @@ util_dump_shader_state(FILE *stream, const struct pipe_shader_state *state)
       return;
    }
 
-   tgsi_dump_str(state->tokens, 0, str, sizeof(str));
-
    util_dump_struct_begin(stream, "pipe_shader_state");
 
    util_dump_member_begin(stream, "tokens");
-   util_dump_string(stream, str);
+   fprintf(stream, "\"\n");
+   tgsi_dump_to_file(state->tokens, 0, stream);
+   fprintf(stream, "\"");
    util_dump_member_end(stream);
 
-   util_dump_member_begin(stream, "stream_output");
-   util_dump_struct_begin(stream, "pipe_stream_output_info");
-   util_dump_member(stream, uint, &state->stream_output, num_outputs);
-   util_dump_array(stream, uint, state->stream_output.stride,
-                   Elements(state->stream_output.stride));
-   util_dump_array_begin(stream);
-   for(i = 0; i < state->stream_output.num_outputs; ++i) {
-      util_dump_elem_begin(stream);
-      util_dump_struct_begin(stream, ""); /* anonymous */
-      util_dump_member(stream, uint, &state->stream_output.output[i], register_index);
-      util_dump_member(stream, uint, &state->stream_output.output[i], start_component);
-      util_dump_member(stream, uint, &state->stream_output.output[i], num_components);
-      util_dump_member(stream, uint, &state->stream_output.output[i], output_buffer);
+   if (state->stream_output.num_outputs) {
+      util_dump_member_begin(stream, "stream_output");
+      util_dump_struct_begin(stream, "pipe_stream_output_info");
+      util_dump_member(stream, uint, &state->stream_output, num_outputs);
+      util_dump_array(stream, uint, state->stream_output.stride,
+                      Elements(state->stream_output.stride));
+      util_dump_array_begin(stream);
+      for(i = 0; i < state->stream_output.num_outputs; ++i) {
+         util_dump_elem_begin(stream);
+         util_dump_struct_begin(stream, ""); /* anonymous */
+         util_dump_member(stream, uint, &state->stream_output.output[i], register_index);
+         util_dump_member(stream, uint, &state->stream_output.output[i], start_component);
+         util_dump_member(stream, uint, &state->stream_output.output[i], num_components);
+         util_dump_member(stream, uint, &state->stream_output.output[i], output_buffer);
+         util_dump_struct_end(stream);
+         util_dump_elem_end(stream);
+      }
+      util_dump_array_end(stream);
       util_dump_struct_end(stream);
-      util_dump_elem_end(stream);
+      util_dump_member_end(stream);
    }
-   util_dump_array_end(stream);
-   util_dump_struct_end(stream);
-   util_dump_member_end(stream);
 
    util_dump_struct_end(stream);
 }
@@ -496,9 +525,12 @@ util_dump_depth_stencil_alpha_state(FILE *stream, const struct pipe_depth_stenci
       util_dump_member(stream, bool, &state->stencil[i], enabled);
       if (state->stencil[i].enabled) {
          util_dump_member(stream, enum_func, &state->stencil[i], func);
-         util_dump_member(stream, uint, &state->stencil[i], fail_op);
-         util_dump_member(stream, uint, &state->stencil[i], zpass_op);
-         util_dump_member(stream, uint, &state->stencil[i], zfail_op);
+         util_dump_member(stream, enum_stencil_op,
+                          &state->stencil[i], fail_op);
+         util_dump_member(stream, enum_stencil_op,
+                          &state->stencil[i], zpass_op);
+         util_dump_member(stream, enum_stencil_op,
+                          &state->stencil[i], zfail_op);
          util_dump_member(stream, uint, &state->stencil[i], valuemask);
          util_dump_member(stream, uint, &state->stencil[i], writemask);
       }
@@ -555,6 +587,8 @@ util_dump_blend_state(FILE *stream, const struct pipe_blend_state *state)
    util_dump_struct_begin(stream, "pipe_blend_state");
 
    util_dump_member(stream, bool, state, dither);
+   util_dump_member(stream, bool, state, alpha_to_coverage);
+   util_dump_member(stream, bool, state, alpha_to_one);
 
    util_dump_member(stream, bool, state, logicop_enable);
    if (state->logicop_enable) {
@@ -629,16 +663,17 @@ util_dump_sampler_state(FILE *stream, const struct pipe_sampler_state *state)
 
    util_dump_struct_begin(stream, "pipe_sampler_state");
 
-   util_dump_member(stream, uint, state, wrap_s);
-   util_dump_member(stream, uint, state, wrap_t);
-   util_dump_member(stream, uint, state, wrap_r);
-   util_dump_member(stream, uint, state, min_img_filter);
-   util_dump_member(stream, uint, state, min_mip_filter);
-   util_dump_member(stream, uint, state, mag_img_filter);
+   util_dump_member(stream, enum_tex_wrap, state, wrap_s);
+   util_dump_member(stream, enum_tex_wrap, state, wrap_t);
+   util_dump_member(stream, enum_tex_wrap, state, wrap_r);
+   util_dump_member(stream, enum_tex_filter, state, min_img_filter);
+   util_dump_member(stream, enum_tex_mipfilter, state, min_mip_filter);
+   util_dump_member(stream, enum_tex_filter, state, mag_img_filter);
    util_dump_member(stream, uint, state, compare_mode);
    util_dump_member(stream, enum_func, state, compare_func);
    util_dump_member(stream, bool, state, normalized_coords);
    util_dump_member(stream, uint, state, max_anisotropy);
+   util_dump_member(stream, bool, state, seamless_cube_map);
    util_dump_member(stream, float, state, lod_bias);
    util_dump_member(stream, float, state, min_lod);
    util_dump_member(stream, float, state, max_lod);
@@ -671,6 +706,67 @@ util_dump_surface(FILE *stream, const struct pipe_surface *state)
 }
 
 
+void
+util_dump_image_view(FILE *stream, const struct pipe_image_view *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_image_view");
+
+   util_dump_member(stream, ptr, state, resource);
+   util_dump_member(stream, format, state, format);
+   /* No resource means no target to test: fall through to the u.tex arm. */
+   if (state->resource && state->resource->target == PIPE_BUFFER) {
+      util_dump_member(stream, uint, state, u.buf.first_element);
+      util_dump_member(stream, uint, state, u.buf.last_element);
+   }
+   else {
+      util_dump_member(stream, uint, state, u.tex.first_layer);
+      util_dump_member(stream, uint, state, u.tex.last_layer);
+      util_dump_member(stream, uint, state, u.tex.level);
+   }
+
+   util_dump_struct_end(stream);
+}
+
+/** Dump a pipe_sampler_view; target selects the u.buf or u.tex range. */
+void
+util_dump_sampler_view(FILE *stream, const struct pipe_sampler_view *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_sampler_view");
+
+   util_dump_member(stream, enum_tex_target, state, target);
+   util_dump_member(stream, format, state, format);
+   util_dump_member(stream, ptr, state, texture);
+
+   if (state->target == PIPE_BUFFER) {
+      util_dump_member(stream, uint, state, u.buf.first_element);
+      util_dump_member(stream, uint, state, u.buf.last_element);
+   }
+   else {
+      util_dump_member(stream, uint, state, u.tex.first_layer);
+      util_dump_member(stream, uint, state, u.tex.last_layer);
+      util_dump_member(stream, uint, state, u.tex.first_level);
+      util_dump_member(stream, uint, state, u.tex.last_level);
+   }
+
+   util_dump_member(stream, uint, state, swizzle_r);
+   util_dump_member(stream, uint, state, swizzle_g);
+   util_dump_member(stream, uint, state, swizzle_b);
+   util_dump_member(stream, uint, state, swizzle_a);
+
+   util_dump_struct_end(stream);
+}
+
+
 void
 util_dump_transfer(FILE *stream, const struct pipe_transfer *state)
 {
@@ -694,6 +790,45 @@ util_dump_transfer(FILE *stream, const struct pipe_transfer *state)
 }
 
 
+void
+util_dump_constant_buffer(FILE *stream,
+                          const struct pipe_constant_buffer *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_constant_buffer");
+
+   util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, uint, state, buffer_offset);
+   util_dump_member(stream, uint, state, buffer_size);
+   util_dump_member(stream, ptr, state, user_buffer);
+
+   util_dump_struct_end(stream);
+}
+
+
+void
+util_dump_index_buffer(FILE *stream, const struct pipe_index_buffer *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_index_buffer");
+
+   util_dump_member(stream, uint, state, index_size);
+   util_dump_member(stream, uint, state, offset);
+   util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, ptr, state, user_buffer);
+
+   util_dump_struct_end(stream);
+}
+
+
 void
 util_dump_vertex_buffer(FILE *stream, const struct pipe_vertex_buffer *state)
 {
@@ -707,6 +842,7 @@ util_dump_vertex_buffer(FILE *stream, const struct pipe_vertex_buffer *state)
    util_dump_member(stream, uint, state, stride);
    util_dump_member(stream, uint, state, buffer_offset);
    util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, ptr, state, user_buffer);
 
    util_dump_struct_end(stream);
 }
@@ -731,6 +867,25 @@ util_dump_vertex_element(FILE *stream, const struct pipe_vertex_element *state)
 }
 
 
+void
+util_dump_stream_output_target(FILE *stream,
+                               const struct pipe_stream_output_target *state)
+{
+   if (!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_stream_output_target");
+
+   util_dump_member(stream, ptr, state, buffer);
+   util_dump_member(stream, uint, state, buffer_offset);
+   util_dump_member(stream, uint, state, buffer_size);
+
+   util_dump_struct_end(stream);
+}
+
+
 void
 util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state)
 {
@@ -743,7 +898,7 @@ util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state)
 
    util_dump_member(stream, bool, state, indexed);
 
-   util_dump_member(stream, uint, state, mode);
+   util_dump_member(stream, enum_prim_mode, state, mode);
    util_dump_member(stream, uint, state, start);
    util_dump_member(stream, uint, state, count);
 
@@ -830,12 +985,14 @@ void util_dump_blit_info(FILE *stream, const struct pipe_blit_info *info)
    util_dump_member_begin(stream, "mask");
    util_dump_string(stream, mask);
    util_dump_member_end(stream);
-   util_dump_member(stream, uint, info, filter);
+   util_dump_member(stream, enum_tex_filter, info, filter);
 
    util_dump_member(stream, bool, info, scissor_enable);
    util_dump_member_begin(stream, "scissor");
    util_dump_scissor_state(stream, &info->scissor);
    util_dump_member_end(stream);
 
+   util_dump_member(stream, bool, info, render_condition_enable);
+
    util_dump_struct_end(stream);
 }
diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h
index 980cadf22d1..7b7a093d824 100644
--- a/src/gallium/auxiliary/util/u_dynarray.h
+++ b/src/gallium/auxiliary/util/u_dynarray.h
@@ -43,13 +43,13 @@ struct util_dynarray
    unsigned capacity;
 };
 
-static INLINE void
+static inline void
 util_dynarray_init(struct util_dynarray *buf)
 {
    memset(buf, 0, sizeof(*buf));
 }
 
-static INLINE void
+static inline void
 util_dynarray_fini(struct util_dynarray *buf)
 {
    if(buf->data)
@@ -60,7 +60,7 @@ util_dynarray_fini(struct util_dynarray *buf)
 }
 
 /* use util_dynarray_trim to reduce the allocated storage */
-static INLINE void *
+static inline void *
 util_dynarray_resize(struct util_dynarray *buf, unsigned newsize)
 {
    char *p;
@@ -78,13 +78,13 @@ util_dynarray_resize(struct util_dynarray *buf, unsigned newsize)
    return p;
 }
 
-static INLINE void *
+static inline void *
 util_dynarray_grow(struct util_dynarray *buf, int diff)
 {
    return util_dynarray_resize(buf, buf->size + diff);
 }
 
-static INLINE void
+static inline void
 util_dynarray_trim(struct util_dynarray *buf)
 {
    if (buf->size != buf->capacity) {
diff --git a/src/gallium/auxiliary/util/u_fifo.h b/src/gallium/auxiliary/util/u_fifo.h
index 9e007de1ada..a7aad6179d9 100644
--- a/src/gallium/auxiliary/util/u_fifo.h
+++ b/src/gallium/auxiliary/util/u_fifo.h
@@ -36,7 +36,7 @@ struct util_fifo
    size_t size;
 };
 
-static INLINE struct util_fifo *
+static inline struct util_fifo *
 u_fifo_create(size_t size)
 {
    struct util_fifo *fifo;
@@ -50,7 +50,7 @@ u_fifo_create(size_t size)
    return fifo;
 }
 
-static INLINE boolean
+static inline boolean
 u_fifo_add(struct util_fifo *fifo, void *ptr)
 {
    void **array = (void**)&fifo[1];
@@ -67,7 +67,7 @@ u_fifo_add(struct util_fifo *fifo, void *ptr)
    return TRUE;
 }
 
-static INLINE boolean
+static inline boolean
 u_fifo_pop(struct util_fifo *fifo, void **ptr)
 {
    void **array = (void**)&fifo[1];
@@ -85,7 +85,7 @@ u_fifo_pop(struct util_fifo *fifo, void **ptr)
    return TRUE;
 }
 
-static INLINE void
+static inline void
 u_fifo_destroy(struct util_fifo *fifo)
 {
    FREE(fifo);
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index 621574c9673..42b39ff04fd 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -425,7 +425,7 @@ util_format_description(enum pipe_format format);
  * Format query functions.
  */
 
-static INLINE const char *
+static inline const char *
 util_format_name(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -438,7 +438,7 @@ util_format_name(enum pipe_format format)
    return desc->name;
 }
 
-static INLINE const char *
+static inline const char *
 util_format_short_name(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -454,7 +454,7 @@ util_format_short_name(enum pipe_format format)
 /**
  * Whether this format is plain, see UTIL_FORMAT_LAYOUT_PLAIN for more info.
  */
-static INLINE boolean
+static inline boolean
 util_format_is_plain(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -466,7 +466,7 @@ util_format_is_plain(enum pipe_format format)
    return desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ? TRUE : FALSE;
 }
 
-static INLINE boolean 
+static inline boolean 
 util_format_is_compressed(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -488,7 +488,7 @@ util_format_is_compressed(enum pipe_format format)
    }
 }
 
-static INLINE boolean 
+static inline boolean 
 util_format_is_s3tc(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -501,28 +501,28 @@ util_format_is_s3tc(enum pipe_format format)
    return desc->layout == UTIL_FORMAT_LAYOUT_S3TC ? TRUE : FALSE;
 }
 
-static INLINE boolean 
+static inline boolean 
 util_format_is_srgb(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
    return desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB;
 }
 
-static INLINE boolean
+static inline boolean
 util_format_has_depth(const struct util_format_description *desc)
 {
    return desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
           desc->swizzle[0] != UTIL_FORMAT_SWIZZLE_NONE;
 }
 
-static INLINE boolean
+static inline boolean
 util_format_has_stencil(const struct util_format_description *desc)
 {
    return desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
           desc->swizzle[1] != UTIL_FORMAT_SWIZZLE_NONE;
 }
 
-static INLINE boolean
+static inline boolean
 util_format_is_depth_or_stencil(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -536,7 +536,7 @@ util_format_is_depth_or_stencil(enum pipe_format format)
           util_format_has_stencil(desc);
 }
 
-static INLINE boolean
+static inline boolean
 util_format_is_depth_and_stencil(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -554,7 +554,7 @@ util_format_is_depth_and_stencil(enum pipe_format format)
 /**
  * Calculates the depth format type based upon the incoming format description.
  */
-static INLINE unsigned
+static inline unsigned
 util_get_depth_format_type(const struct util_format_description *desc)
 {
    unsigned depth_channel = desc->swizzle[0];
@@ -581,7 +581,7 @@ util_get_depth_format_mrd(const struct util_format_description *desc);
  * Return whether this is an RGBA, Z, S, or combined ZS format.
  * Useful for initializing pipe_blit_info::mask.
  */
-static INLINE unsigned
+static inline unsigned
 util_format_get_mask(enum pipe_format format)
 {
    const struct util_format_description *desc =
@@ -611,7 +611,7 @@ util_format_get_mask(enum pipe_format format)
  *
  * That is, the channels whose values are preserved.
  */
-static INLINE unsigned
+static inline unsigned
 util_format_colormask(const struct util_format_description *desc)
 {
    unsigned colormask;
@@ -643,7 +643,7 @@ util_format_colormask(const struct util_format_description *desc)
  * @param desc       a format description to check colormask with
  * @param colormask  a bit mask for channels, matches format of PIPE_MASK_RGBA
  */
-static INLINE boolean
+static inline boolean
 util_format_colormask_full(const struct util_format_description *desc, unsigned colormask)
 {
    return (~colormask & util_format_colormask(desc)) == 0;
@@ -709,7 +709,7 @@ util_format_is_supported(enum pipe_format format, unsigned bind);
  *
  *   PIPE_FORMAT_?8?8?8?8_UNORM
  */
-static INLINE boolean
+static inline boolean
 util_format_is_rgba8_variant(const struct util_format_description *desc)
 {
    unsigned chan;
@@ -737,7 +737,7 @@ util_format_is_rgba8_variant(const struct util_format_description *desc)
 /**
  * Return total bits needed for the pixel format per block.
  */
-static INLINE uint
+static inline uint
 util_format_get_blocksizebits(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -753,7 +753,7 @@ util_format_get_blocksizebits(enum pipe_format format)
 /**
  * Return bytes per block (not pixel) for the given format.
  */
-static INLINE uint
+static inline uint
 util_format_get_blocksize(enum pipe_format format)
 {
    uint bits = util_format_get_blocksizebits(format);
@@ -768,7 +768,7 @@ util_format_get_blocksize(enum pipe_format format)
    return bytes;
 }
 
-static INLINE uint
+static inline uint
 util_format_get_blockwidth(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -781,7 +781,7 @@ util_format_get_blockwidth(enum pipe_format format)
    return desc->block.width;
 }
 
-static INLINE uint
+static inline uint
 util_format_get_blockheight(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -794,7 +794,7 @@ util_format_get_blockheight(enum pipe_format format)
    return desc->block.height;
 }
 
-static INLINE unsigned
+static inline unsigned
 util_format_get_nblocksx(enum pipe_format format,
                          unsigned x)
 {
@@ -802,7 +802,7 @@ util_format_get_nblocksx(enum pipe_format format,
    return (x + blockwidth - 1) / blockwidth;
 }
 
-static INLINE unsigned
+static inline unsigned
 util_format_get_nblocksy(enum pipe_format format,
                          unsigned y)
 {
@@ -810,7 +810,7 @@ util_format_get_nblocksy(enum pipe_format format,
    return (y + blockheight - 1) / blockheight;
 }
 
-static INLINE unsigned
+static inline unsigned
 util_format_get_nblocks(enum pipe_format format,
                         unsigned width,
                         unsigned height)
@@ -818,14 +818,14 @@ util_format_get_nblocks(enum pipe_format format,
    return util_format_get_nblocksx(format, width) * util_format_get_nblocksy(format, height);
 }
 
-static INLINE size_t
+static inline size_t
 util_format_get_stride(enum pipe_format format,
                        unsigned width)
 {
    return util_format_get_nblocksx(format, width) * util_format_get_blocksize(format);
 }
 
-static INLINE size_t
+static inline size_t
 util_format_get_2d_size(enum pipe_format format,
                         size_t stride,
                         unsigned height)
@@ -833,7 +833,7 @@ util_format_get_2d_size(enum pipe_format format,
    return util_format_get_nblocksy(format, height) * stride;
 }
 
-static INLINE uint
+static inline uint
 util_format_get_component_bits(enum pipe_format format,
                                enum util_format_colorspace colorspace,
                                uint component)
@@ -880,7 +880,7 @@ util_format_get_component_bits(enum pipe_format format,
  * Given a linear RGB colorspace format, return the corresponding SRGB
  * format, or PIPE_FORMAT_NONE if none.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_srgb(enum pipe_format format)
 {
    if (util_format_is_srgb(format))
@@ -930,7 +930,7 @@ util_format_srgb(enum pipe_format format)
  * Given an sRGB format, return the corresponding linear colorspace format.
  * For non sRGB formats, return the format unchanged.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_linear(enum pipe_format format)
 {
    switch (format) {
@@ -977,7 +977,7 @@ util_format_linear(enum pipe_format format)
  * Given a depth-stencil format, return the corresponding stencil-only format.
  * For stencil-only formats, return the format unchanged.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_stencil_only(enum pipe_format format)
 {
    switch (format) {
@@ -1006,7 +1006,7 @@ util_format_stencil_only(enum pipe_format format)
  * Converts PIPE_FORMAT_*I* to PIPE_FORMAT_*R*.
  * This is identity for non-intensity formats.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_intensity_to_red(enum pipe_format format)
 {
    switch (format) {
@@ -1044,7 +1044,7 @@ util_format_intensity_to_red(enum pipe_format format)
  * Converts PIPE_FORMAT_*L* to PIPE_FORMAT_*R*.
  * This is identity for non-luminance formats.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 util_format_luminance_to_red(enum pipe_format format)
 {
    switch (format) {
@@ -1122,7 +1122,7 @@ util_format_luminance_to_red(enum pipe_format format)
  * Return the number of components stored.
  * Formats with block size != 1x1 will always have 1 component (the block).
  */
-static INLINE unsigned
+static inline unsigned
 util_format_get_nr_components(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -1133,7 +1133,7 @@ util_format_get_nr_components(enum pipe_format format)
  * Return the index of the first non-void channel
  * -1 if no non-void channels
  */
-static INLINE int
+static inline int
 util_format_get_first_non_void_channel(enum pipe_format format)
 {
    const struct util_format_description *desc = util_format_description(format);
diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py
index d5138cc0577..fb42de723c4 100644
--- a/src/gallium/auxiliary/util/u_format_pack.py
+++ b/src/gallium/auxiliary/util/u_format_pack.py
@@ -616,7 +616,7 @@ def generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix):
 
     name = format.short_name()
 
-    print 'static INLINE void'
+    print 'static inline void'
     print 'util_format_%s_unpack_%s(%s *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, dst_suffix, dst_native_type)
     print '{'
 
@@ -645,7 +645,7 @@ def generate_format_pack(format, src_channel, src_native_type, src_suffix):
 
     name = format.short_name()
 
-    print 'static INLINE void'
+    print 'static inline void'
     print 'util_format_%s_pack_%s(uint8_t *dst_row, unsigned dst_stride, const %s *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, src_suffix, src_native_type)
     print '{'
     
@@ -674,7 +674,7 @@ def generate_format_fetch(format, dst_channel, dst_native_type, dst_suffix):
 
     name = format.short_name()
 
-    print 'static INLINE void'
+    print 'static inline void'
     print 'util_format_%s_fetch_%s(%s *dst, const uint8_t *src, unsigned i, unsigned j)' % (name, dst_suffix, dst_native_type)
     print '{'
 
diff --git a/src/gallium/auxiliary/util/u_format_r11g11b10f.h b/src/gallium/auxiliary/util/u_format_r11g11b10f.h
index 57516c39c6e..218822b16e6 100644
--- a/src/gallium/auxiliary/util/u_format_r11g11b10f.h
+++ b/src/gallium/auxiliary/util/u_format_r11g11b10f.h
@@ -45,7 +45,7 @@
 
 #define F32_INFINITY         0x7f800000
 
-static INLINE unsigned f32_to_uf11(float val)
+static inline unsigned f32_to_uf11(float val)
 {
    union {
       float f;
@@ -94,7 +94,7 @@ static INLINE unsigned f32_to_uf11(float val)
    return uf11;
 }
 
-static INLINE float uf11_to_f32(uint16_t val)
+static inline float uf11_to_f32(uint16_t val)
 {
    union {
       float f;
@@ -131,7 +131,7 @@ static INLINE float uf11_to_f32(uint16_t val)
    return f32.f;
 }
 
-static INLINE unsigned f32_to_uf10(float val)
+static inline unsigned f32_to_uf10(float val)
 {
    union {
       float f;
@@ -180,7 +180,7 @@ static INLINE unsigned f32_to_uf10(float val)
    return uf10;
 }
 
-static INLINE float uf10_to_f32(uint16_t val)
+static inline float uf10_to_f32(uint16_t val)
 {
    union {
       float f;
@@ -217,14 +217,14 @@ static INLINE float uf10_to_f32(uint16_t val)
    return f32.f;
 }
 
-static INLINE unsigned float3_to_r11g11b10f(const float rgb[3])
+static inline unsigned float3_to_r11g11b10f(const float rgb[3])
 {
    return ( f32_to_uf11(rgb[0]) & 0x7ff) |
           ((f32_to_uf11(rgb[1]) & 0x7ff) << 11) |
           ((f32_to_uf10(rgb[2]) & 0x3ff) << 22);
 }
 
-static INLINE void r11g11b10f_to_float3(unsigned rgb, float retval[3])
+static inline void r11g11b10f_to_float3(unsigned rgb, float retval[3])
 {
    retval[0] = uf11_to_f32( rgb        & 0x7ff);
    retval[1] = uf11_to_f32((rgb >> 11) & 0x7ff);
diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index c2a3f6f3e9d..59fc291e917 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -26,9 +26,10 @@
 #ifndef RGB9E5_H
 #define RGB9E5_H
 
-#include <math.h>
 #include <assert.h>
 
+#include "c99_math.h"
+
 #define RGB9E5_EXPONENT_BITS          5
 #define RGB9E5_MANTISSA_BITS          9
 #define RGB9E5_EXP_BIAS               15
@@ -73,9 +74,9 @@ typedef union {
    } field;
 } rgb9e5;
 
-static INLINE float rgb9e5_ClampRange(float x)
+static inline float rgb9e5_ClampRange(float x)
 {
-   if (x > 0.0) {
+   if (x > 0.0f) {
       if (x >= MAX_RGB9E5) {
          return MAX_RGB9E5;
       } else {
@@ -90,7 +91,7 @@ static INLINE float rgb9e5_ClampRange(float x)
 /* Ok, FloorLog2 is not correct for the denorm and zero values, but we
    are going to do a max of this value with the minimum rgb9e5 exponent
    that will hide these problem cases. */
-static INLINE int rgb9e5_FloorLog2(float x)
+static inline int rgb9e5_FloorLog2(float x)
 {
    float754 f;
 
@@ -98,7 +99,7 @@ static INLINE int rgb9e5_FloorLog2(float x)
    return (f.field.biasedexponent - 127);
 }
 
-static INLINE unsigned float3_to_rgb9e5(const float rgb[3])
+static inline unsigned float3_to_rgb9e5(const float rgb[3])
 {
    rgb9e5 retval;
    float maxrgb;
@@ -115,8 +116,8 @@ static INLINE unsigned float3_to_rgb9e5(const float rgb[3])
    exp_shared = MAX2(-RGB9E5_EXP_BIAS-1, rgb9e5_FloorLog2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
    assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
    assert(exp_shared >= 0);
-   /* This pow function could be replaced by a table. */
-   denom = pow(2, exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
+   /* This exp2 function could be replaced by a table. */
+   denom = exp2(exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
 
    maxm = (int) floor(maxrgb / denom + 0.5);
    if (maxm == MAX_RGB9E5_MANTISSA+1) {
@@ -146,7 +147,7 @@ static INLINE unsigned float3_to_rgb9e5(const float rgb[3])
    return retval.raw;
 }
 
-static INLINE void rgb9e5_to_float3(unsigned rgb, float retval[3])
+static inline void rgb9e5_to_float3(unsigned rgb, float retval[3])
 {
    rgb9e5 v;
    int exponent;
@@ -154,7 +155,7 @@ static INLINE void rgb9e5_to_float3(unsigned rgb, float retval[3])
 
    v.raw = rgb;
    exponent = v.field.biasedexponent - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS;
-   scale = (float) pow(2, exponent);
+   scale = exp2f(exponent);
 
    retval[0] = v.field.r * scale;
    retval[1] = v.field.g * scale;
diff --git a/src/gallium/auxiliary/util/u_format_s3tc.c b/src/gallium/auxiliary/util/u_format_s3tc.c
index 7e05989e6a1..cd3e165d3f0 100644
--- a/src/gallium/auxiliary/util/u_format_s3tc.c
+++ b/src/gallium/auxiliary/util/u_format_s3tc.c
@@ -235,7 +235,7 @@ util_format_dxt5_rgba_fetch_rgba_float(float *dst, const uint8_t *src, unsigned
  * Block decompression.
  */
 
-static INLINE void
+static inline void
 util_format_dxtn_rgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                         const uint8_t *src_row, unsigned src_stride,
                                         unsigned width, unsigned height,
@@ -312,7 +312,7 @@ util_format_dxt5_rgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                            16, FALSE);
 }
 
-static INLINE void
+static inline void
 util_format_dxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride,
                                        const uint8_t *src_row, unsigned src_stride,
                                        unsigned width, unsigned height,
@@ -400,7 +400,7 @@ util_format_dxt5_rgba_unpack_rgba_float(float *dst_row, unsigned dst_stride,
  * Block compression.
  */
 
-static INLINE void
+static inline void
 util_format_dxtn_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                   const uint8_t *src, unsigned src_stride,
                                   unsigned width, unsigned height,
@@ -478,7 +478,7 @@ util_format_dxt5_rgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
                                      16, FALSE);
 }
 
-static INLINE void
+static inline void
 util_format_dxtn_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
                                  const float *src, unsigned src_stride,
                                  unsigned width, unsigned height,
diff --git a/src/gallium/auxiliary/util/u_format_yuv.h b/src/gallium/auxiliary/util/u_format_yuv.h
index 4ec39812e47..41524d63f3a 100644
--- a/src/gallium/auxiliary/util/u_format_yuv.h
+++ b/src/gallium/auxiliary/util/u_format_yuv.h
@@ -54,7 +54,7 @@
  * precision in the coefficients.
  */
 
-static INLINE void
+static inline void
 util_format_rgb_float_to_yuv(float r, float g, float b,
                              uint8_t *y, uint8_t *u, uint8_t *v)
 {
@@ -74,7 +74,7 @@ util_format_rgb_float_to_yuv(float r, float g, float b,
 }
 
 
-static INLINE void
+static inline void
 util_format_yuv_to_rgb_float(uint8_t y, uint8_t u, uint8_t v,
                              float *r, float *g, float *b)
 {
@@ -92,7 +92,7 @@ util_format_yuv_to_rgb_float(uint8_t y, uint8_t u, uint8_t v,
 }
 
 
-static INLINE void
+static inline void
 util_format_rgb_8unorm_to_yuv(uint8_t r, uint8_t g, uint8_t b,
                 	      uint8_t *y, uint8_t *u, uint8_t *v)
 {
@@ -102,7 +102,7 @@ util_format_rgb_8unorm_to_yuv(uint8_t r, uint8_t g, uint8_t b,
 }
 
 
-static INLINE void
+static inline void
 util_format_yuv_to_rgb_8unorm(uint8_t y, uint8_t u, uint8_t v,
                               uint8_t *r, uint8_t *g, uint8_t *b)
 {
diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c
index f1ed32f1d5c..69f2f2971f7 100644
--- a/src/gallium/auxiliary/util/u_format_zs.c
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -35,28 +35,28 @@
  * z32_unorm conversion functions
  */
 
-static INLINE uint16_t
+static inline uint16_t
 z32_unorm_to_z16_unorm(uint32_t z)
 {
    /* z * 0xffff / 0xffffffff */
    return z >> 16;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z16_unorm_to_z32_unorm(uint16_t z)
 {
    /* z * 0xffffffff / 0xffff */
    return (z << 16) | z;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z32_unorm_to_z24_unorm(uint32_t z)
 {
    /* z * 0xffffff / 0xffffffff */
    return z >> 8;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z24_unorm_to_z32_unorm(uint32_t z)
 {
    /* z * 0xffffffff / 0xffffff */
@@ -68,42 +68,42 @@ z24_unorm_to_z32_unorm(uint32_t z)
  * z32_float conversion functions
  */
 
-static INLINE uint16_t
+static inline uint16_t
 z32_float_to_z16_unorm(float z)
 {
    const float scale = 0xffff;
    return (uint16_t)(z * scale + 0.5f);
 }
 
-static INLINE float
+static inline float
 z16_unorm_to_z32_float(uint16_t z)
 {
    const float scale = 1.0 / 0xffff;
    return (float)(z * scale);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z32_float_to_z24_unorm(float z)
 {
    const double scale = 0xffffff;
    return (uint32_t)(z * scale) & 0xffffff;
 }
 
-static INLINE float
+static inline float
 z24_unorm_to_z32_float(uint32_t z)
 {
    const double scale = 1.0 / 0xffffff;
    return (float)(z * scale);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 z32_float_to_z32_unorm(float z)
 {
    const double scale = 0xffffffff;
    return (uint32_t)(z * scale);
 }
 
-static INLINE float
+static inline float
 z32_unorm_to_z32_float(uint32_t z)
 {
    const double scale = 1.0 / 0xffffffff;
diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h
index d340b9a7aef..d28fae3c77d 100644
--- a/src/gallium/auxiliary/util/u_half.h
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -43,7 +43,7 @@ extern "C" {
  *  https://gist.github.com/2144712
  */
 
-static INLINE uint16_t
+static inline uint16_t
 util_float_to_half(float f)
 {
    uint32_t sign_mask  = 0x80000000;
@@ -96,7 +96,7 @@ util_float_to_half(float f)
    return f16;
 }
 
-static INLINE float
+static inline float
 util_half_to_float(uint16_t f16)
 {
    union fi infnan;
diff --git a/src/gallium/auxiliary/util/u_handle_table.c b/src/gallium/auxiliary/util/u_handle_table.c
index 85302f1e194..42c4e44b644 100644
--- a/src/gallium/auxiliary/util/u_handle_table.c
+++ b/src/gallium/auxiliary/util/u_handle_table.c
@@ -96,7 +96,7 @@ handle_table_set_destroy(struct handle_table *ht,
 /**
  * Resize the table if necessary 
  */
-static INLINE int
+static inline int
 handle_table_resize(struct handle_table *ht,
                     unsigned minimum_size)
 {
@@ -126,7 +126,7 @@ handle_table_resize(struct handle_table *ht,
 }
 
 
-static INLINE void
+static inline void
 handle_table_clear(struct handle_table *ht, 
                    unsigned index)
 {
diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c
index 06c8b5c91a5..a505fbc4d83 100644
--- a/src/gallium/auxiliary/util/u_hash_table.c
+++ b/src/gallium/auxiliary/util/u_hash_table.c
@@ -68,7 +68,7 @@ struct util_hash_table_item
 };
 
 
-static INLINE struct util_hash_table_item *
+static inline struct util_hash_table_item *
 util_hash_table_item(struct cso_hash_iter iter)
 {
    return (struct util_hash_table_item *)cso_hash_iter_data(iter);
@@ -98,7 +98,7 @@ util_hash_table_create(unsigned (*hash)(void *key),
 }
 
 
-static INLINE struct cso_hash_iter
+static inline struct cso_hash_iter
 util_hash_table_find_iter(struct util_hash_table *ht,
                           void *key,
                           unsigned key_hash)
@@ -118,7 +118,7 @@ util_hash_table_find_iter(struct util_hash_table *ht,
 }
 
 
-static INLINE struct util_hash_table_item *
+static inline struct util_hash_table_item *
 util_hash_table_find_item(struct util_hash_table *ht,
                           void *key,
                           unsigned key_hash)
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index 95401621ec3..bb99a02ce49 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -51,13 +51,13 @@ extern "C" {
  */
 
 
-static INLINE void
+static inline void
 pipe_reference_init(struct pipe_reference *reference, unsigned count)
 {
    p_atomic_set(&reference->count, count);
 }
 
-static INLINE boolean
+static inline boolean
 pipe_is_referenced(struct pipe_reference *reference)
 {
    return p_atomic_read(&reference->count) != 0;
@@ -69,7 +69,7 @@ pipe_is_referenced(struct pipe_reference *reference)
  * Both 'ptr' and 'reference' may be NULL.
  * \return TRUE if the object's refcount hits zero and should be destroyed.
  */
-static INLINE boolean
+static inline boolean
 pipe_reference_described(struct pipe_reference *ptr, 
                          struct pipe_reference *reference, 
                          debug_reference_descriptor get_desc)
@@ -96,14 +96,14 @@ pipe_reference_described(struct pipe_reference *ptr,
    return destroy;
 }
 
-static INLINE boolean
+static inline boolean
 pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
 {
    return pipe_reference_described(ptr, reference, 
                                    (debug_reference_descriptor)debug_describe_reference);
 }
 
-static INLINE void
+static inline void
 pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
 {
    struct pipe_surface *old_surf = *ptr;
@@ -120,7 +120,7 @@ pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
  * of using a deleted context's surface_destroy() method when freeing a surface
  * that's shared by multiple contexts.
  */
-static INLINE void
+static inline void
 pipe_surface_release(struct pipe_context *pipe, struct pipe_surface **ptr)
 {
    if (pipe_reference_described(&(*ptr)->reference, NULL,
@@ -130,7 +130,7 @@ pipe_surface_release(struct pipe_context *pipe, struct pipe_surface **ptr)
 }
 
 
-static INLINE void
+static inline void
 pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)
 {
    struct pipe_resource *old_tex = *ptr;
@@ -141,7 +141,7 @@ pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)
    *ptr = tex;
 }
 
-static INLINE void
+static inline void
 pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view)
 {
    struct pipe_sampler_view *old_view = *ptr;
@@ -158,7 +158,7 @@ pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_
  * work-around for fixing a dangling context pointer problem when textures
  * are shared by multiple contexts.  XXX fix this someday.
  */
-static INLINE void
+static inline void
 pipe_sampler_view_release(struct pipe_context *ctx,
                           struct pipe_sampler_view **ptr)
 {
@@ -173,8 +173,18 @@ pipe_sampler_view_release(struct pipe_context *ctx,
    *ptr = NULL;
 }
 
+static inline void
+pipe_image_view_reference(struct pipe_image_view **ptr, struct pipe_image_view *view)
+{
+   struct pipe_image_view *old_view = *ptr;
 
-static INLINE void
+   if (pipe_reference_described(&(*ptr)->reference, &view->reference,
+                                (debug_reference_descriptor)debug_describe_image_view))
+      old_view->context->image_view_destroy(old_view->context, old_view);
+   *ptr = view;
+}
+
+static inline void
 pipe_so_target_reference(struct pipe_stream_output_target **ptr,
                          struct pipe_stream_output_target *target)
 {
@@ -186,7 +196,7 @@ pipe_so_target_reference(struct pipe_stream_output_target **ptr,
    *ptr = target;
 }
 
-static INLINE void
+static inline void
 pipe_surface_reset(struct pipe_context *ctx, struct pipe_surface* ps,
                    struct pipe_resource *pt, unsigned level, unsigned layer)
 {
@@ -199,7 +209,7 @@ pipe_surface_reset(struct pipe_context *ctx, struct pipe_surface* ps,
    ps->context = ctx;
 }
 
-static INLINE void
+static inline void
 pipe_surface_init(struct pipe_context *ctx, struct pipe_surface* ps,
                   struct pipe_resource *pt, unsigned level, unsigned layer)
 {
@@ -209,7 +219,7 @@ pipe_surface_init(struct pipe_context *ctx, struct pipe_surface* ps,
 }
 
 /* Return true if the surfaces are equal. */
-static INLINE boolean
+static inline boolean
 pipe_surface_equal(struct pipe_surface *s1, struct pipe_surface *s2)
 {
    return s1->texture == s2->texture &&
@@ -233,7 +243,7 @@ pipe_surface_equal(struct pipe_surface *s1, struct pipe_surface *s2)
  * \param bind  bitmask of PIPE_BIND_x flags
  * \param usage  bitmask of PIPE_USAGE_x flags
  */
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 pipe_buffer_create( struct pipe_screen *screen,
 		    unsigned bind,
 		    unsigned usage,
@@ -261,7 +271,7 @@ pipe_buffer_create( struct pipe_screen *screen,
  * \param access  bitmask of PIPE_TRANSFER_x flags
  * \param transfer  returns a transfer object
  */
-static INLINE void *
+static inline void *
 pipe_buffer_map_range(struct pipe_context *pipe,
 		      struct pipe_resource *buffer,
 		      unsigned offset,
@@ -292,7 +302,7 @@ pipe_buffer_map_range(struct pipe_context *pipe,
  * \param access  bitmask of PIPE_TRANSFER_x flags
  * \param transfer  returns a transfer object
  */
-static INLINE void *
+static inline void *
 pipe_buffer_map(struct pipe_context *pipe,
                 struct pipe_resource *buffer,
                 unsigned access,
@@ -302,14 +312,14 @@ pipe_buffer_map(struct pipe_context *pipe,
 }
 
 
-static INLINE void
+static inline void
 pipe_buffer_unmap(struct pipe_context *pipe,
                   struct pipe_transfer *transfer)
 {
    pipe->transfer_unmap(pipe, transfer);
 }
 
-static INLINE void
+static inline void
 pipe_buffer_flush_mapped_range(struct pipe_context *pipe,
                                struct pipe_transfer *transfer,
                                unsigned offset,
@@ -333,7 +343,7 @@ pipe_buffer_flush_mapped_range(struct pipe_context *pipe,
    pipe->transfer_flush_region(pipe, transfer, &box);
 }
 
-static INLINE void
+static inline void
 pipe_buffer_write(struct pipe_context *pipe,
                   struct pipe_resource *buf,
                   unsigned offset,
@@ -367,7 +377,7 @@ pipe_buffer_write(struct pipe_context *pipe,
  * We can avoid GPU/CPU synchronization when writing range that has never
  * been written before.
  */
-static INLINE void
+static inline void
 pipe_buffer_write_nooverlap(struct pipe_context *pipe,
                             struct pipe_resource *buf,
                             unsigned offset, unsigned size,
@@ -393,7 +403,7 @@ pipe_buffer_write_nooverlap(struct pipe_context *pipe,
  * \param bind  bitmask of PIPE_BIND_x flags
  * \param usage  bitmask of PIPE_USAGE_x flags
  */
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 pipe_buffer_create_with_data(struct pipe_context *pipe,
                              unsigned bind,
                              unsigned usage,
@@ -406,7 +416,7 @@ pipe_buffer_create_with_data(struct pipe_context *pipe,
    return res;
 }
 
-static INLINE void
+static inline void
 pipe_buffer_read(struct pipe_context *pipe,
                  struct pipe_resource *buf,
                  unsigned offset,
@@ -433,7 +443,7 @@ pipe_buffer_read(struct pipe_context *pipe,
  * Map a resource for reading/writing.
  * \param access  bitmask of PIPE_TRANSFER_x flags
  */
-static INLINE void *
+static inline void *
 pipe_transfer_map(struct pipe_context *context,
                   struct pipe_resource *resource,
                   unsigned level, unsigned layer,
@@ -456,7 +466,7 @@ pipe_transfer_map(struct pipe_context *context,
  * Map a 3D (texture) resource for reading/writing.
  * \param access  bitmask of PIPE_TRANSFER_x flags
  */
-static INLINE void *
+static inline void *
 pipe_transfer_map_3d(struct pipe_context *context,
                      struct pipe_resource *resource,
                      unsigned level,
@@ -474,14 +484,14 @@ pipe_transfer_map_3d(struct pipe_context *context,
                                 &box, transfer);
 }
 
-static INLINE void
+static inline void
 pipe_transfer_unmap( struct pipe_context *context,
                      struct pipe_transfer *transfer )
 {
    context->transfer_unmap( context, transfer );
 }
 
-static INLINE void
+static inline void
 pipe_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
                          struct pipe_resource *buf)
 {
@@ -502,7 +512,7 @@ pipe_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
  * Get the polygon offset enable/disable flag for the given polygon fill mode.
  * \param fill_mode  one of PIPE_POLYGON_MODE_POINT/LINE/FILL
  */
-static INLINE boolean
+static inline boolean
 util_get_offset(const struct pipe_rasterizer_state *templ,
                 unsigned fill_mode)
 {
@@ -519,7 +529,7 @@ util_get_offset(const struct pipe_rasterizer_state *templ,
    }
 }
 
-static INLINE float
+static inline float
 util_get_min_point_size(const struct pipe_rasterizer_state *state)
 {
    /* The point size should be clamped to this value at the rasterizer stage.
@@ -529,7 +539,7 @@ util_get_min_point_size(const struct pipe_rasterizer_state *state)
           !state->multisample ? 1.0f : 0.0f;
 }
 
-static INLINE void
+static inline void
 util_query_clear_result(union pipe_query_result *result, unsigned type)
 {
    switch (type) {
@@ -560,7 +570,7 @@ util_query_clear_result(union pipe_query_result *result, unsigned type)
 }
 
 /** Convert PIPE_TEXTURE_x to TGSI_TEXTURE_x */
-static INLINE unsigned
+static inline unsigned
 util_pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target,
                           unsigned nr_samples)
 {
@@ -605,7 +615,7 @@ util_pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target,
 }
 
 
-static INLINE void
+static inline void
 util_copy_constant_buffer(struct pipe_constant_buffer *dst,
                           const struct pipe_constant_buffer *src)
 {
@@ -623,7 +633,7 @@ util_copy_constant_buffer(struct pipe_constant_buffer *dst,
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 util_max_layer(const struct pipe_resource *r, unsigned level)
 {
    switch (r->target) {
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
index ae14eda3cec..daa2991ced6 100644
--- a/src/gallium/auxiliary/util/u_keymap.c
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -71,7 +71,7 @@ default_delete_func(const struct keymap *map,
 }
 
 
-static INLINE struct keymap_item *
+static inline struct keymap_item *
 hash_table_item(struct cso_hash_iter iter)
 {
    return (struct keymap_item *) cso_hash_iter_data(iter);
@@ -143,7 +143,7 @@ util_delete_keymap(struct keymap *map, void *user)
 }
 
 
-static INLINE struct cso_hash_iter
+static inline struct cso_hash_iter
 hash_table_find_iter(const struct keymap *map, const void *key,
                      unsigned key_hash)
 {
@@ -162,7 +162,7 @@ hash_table_find_iter(const struct keymap *map, const void *key,
 }
 
 
-static INLINE struct keymap_item *
+static inline struct keymap_item *
 hash_table_find_item(const struct keymap *map, const void *key,
                      unsigned key_hash)
 {
diff --git a/src/gallium/auxiliary/util/u_linear.h b/src/gallium/auxiliary/util/u_linear.h
index 81ffc9fb27d..87e52a344d4 100644
--- a/src/gallium/auxiliary/util/u_linear.h
+++ b/src/gallium/auxiliary/util/u_linear.h
@@ -89,7 +89,7 @@ void pipe_linear_fill_info(struct pipe_tile_info *t,
 			   unsigned tile_width, unsigned tile_height,
 			   unsigned tiles_x, unsigned tiles_y);
 
-static INLINE boolean pipe_linear_check_tile(const struct pipe_tile_info *t)
+static inline boolean pipe_linear_check_tile(const struct pipe_tile_info *t)
 {
    if (t->tile.size != t->block.size * t->cols * t->rows)
       return FALSE;
diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
index ae9e9513b04..c58af911be7 100644
--- a/src/gallium/auxiliary/util/u_math.c
+++ b/src/gallium/auxiliary/util/u_math.c
@@ -48,7 +48,7 @@ init_pow2_table(void)
 {
    int i;
    for (i = 0; i < POW2_TABLE_SIZE; i++)
-      pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
+      pow2_table[i] = exp2f((i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
 }
 
 
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 3b4040f0ee2..56bd185f527 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -92,7 +92,7 @@ union di {
 /**
  * Extract the IEEE float32 exponent.
  */
-static INLINE signed
+static inline signed
 util_get_float32_exponent(float x)
 {
    union fi f;
@@ -112,7 +112,7 @@ util_get_float32_exponent(float x)
  * Compute exp2(ipart) with i << ipart
  * Compute exp2(fpart) with lookup table.
  */
-static INLINE float
+static inline float
 util_fast_exp2(float x)
 {
    int32_t ipart;
@@ -143,7 +143,7 @@ util_fast_exp2(float x)
 /**
  * Fast approximation to exp(x).
  */
-static INLINE float
+static inline float
 util_fast_exp(float x)
 {
    const float k = 1.44269f; /* = log2(e) */
@@ -160,7 +160,7 @@ extern float log2_table[LOG2_TABLE_SIZE];
 /**
  * Fast approximation to log2(x).
  */
-static INLINE float
+static inline float
 util_fast_log2(float x)
 {
    union fi num;
@@ -176,7 +176,7 @@ util_fast_log2(float x)
 /**
  * Fast approximation to x^y.
  */
-static INLINE float
+static inline float
 util_fast_pow(float x, float y)
 {
    return util_fast_exp2(util_fast_log2(x) * y);
@@ -184,7 +184,7 @@ util_fast_pow(float x, float y)
 
 /* Note that this counts zero as a power of two.
  */
-static INLINE boolean
+static inline boolean
 util_is_power_of_two( unsigned v )
 {
    return (v & (v-1)) == 0;
@@ -194,7 +194,7 @@ util_is_power_of_two( unsigned v )
 /**
  * Floor(x), returned as int.
  */
-static INLINE int
+static inline int
 util_ifloor(float f)
 {
    int ai, bi;
@@ -211,7 +211,7 @@ util_ifloor(float f)
 /**
  * Round float to nearest int.
  */
-static INLINE int
+static inline int
 util_iround(float f)
 {
 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) 
@@ -237,10 +237,10 @@ util_iround(float f)
 /**
  * Approximate floating point comparison
  */
-static INLINE boolean
+static inline boolean
 util_is_approx(float a, float b, float tol)
 {
-   return fabs(b - a) <= tol;
+   return fabsf(b - a) <= tol;
 }
 
 
@@ -256,7 +256,7 @@ util_is_approx(float a, float b, float tol)
 /**
  * Single-float
  */
-static INLINE boolean
+static inline boolean
 util_is_inf_or_nan(float x)
 {
    union fi tmp;
@@ -265,7 +265,7 @@ util_is_inf_or_nan(float x)
 }
 
 
-static INLINE boolean
+static inline boolean
 util_is_nan(float x)
 {
    union fi tmp;
@@ -274,7 +274,7 @@ util_is_nan(float x)
 }
 
 
-static INLINE int
+static inline int
 util_inf_sign(float x)
 {
    union fi tmp;
@@ -290,7 +290,7 @@ util_inf_sign(float x)
 /**
  * Double-float
  */
-static INLINE boolean
+static inline boolean
 util_is_double_inf_or_nan(double x)
 {
    union di tmp;
@@ -299,7 +299,7 @@ util_is_double_inf_or_nan(double x)
 }
 
 
-static INLINE boolean
+static inline boolean
 util_is_double_nan(double x)
 {
    union di tmp;
@@ -308,7 +308,7 @@ util_is_double_nan(double x)
 }
 
 
-static INLINE int
+static inline int
 util_double_inf_sign(double x)
 {
    union di tmp;
@@ -324,21 +324,21 @@ util_double_inf_sign(double x)
 /**
  * Half-float
  */
-static INLINE boolean
+static inline boolean
 util_is_half_inf_or_nan(int16_t x)
 {
    return (x & 0x7c00) == 0x7c00;
 }
 
 
-static INLINE boolean
+static inline boolean
 util_is_half_nan(int16_t x)
 {
    return (x & 0x7fff) > 0x7c00;
 }
 
 
-static INLINE int
+static inline int
 util_half_inf_sign(int16_t x)
 {
    if ((x & 0x7fff) != 0x7c00) {
@@ -359,7 +359,7 @@ util_half_inf_sign(int16_t x)
 #if defined(_MSC_VER) && (_M_IX86 || _M_AMD64 || _M_IA64)
 unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
 #pragma intrinsic(_BitScanForward)
-static INLINE
+static inline
 unsigned long ffs( unsigned long u )
 {
    unsigned long i;
@@ -369,7 +369,7 @@ unsigned long ffs( unsigned long u )
       return 0;
 }
 #elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
-static INLINE
+static inline
 unsigned ffs( unsigned u )
 {
    unsigned i;
@@ -409,7 +409,7 @@ unsigned ffs( unsigned u )
  * Find last bit set in a word.  The least significant bit is 1.
  * Return 0 if no bits are set.
  */
-static INLINE unsigned
+static inline unsigned
 util_last_bit(unsigned u)
 {
 #if defined(HAVE___BUILTIN_CLZ)
@@ -428,7 +428,7 @@ util_last_bit(unsigned u)
  * Find last bit set in a word.  The least significant bit is 1.
  * Return 0 if no bits are set.
  */
-static INLINE unsigned
+static inline unsigned
 util_last_bit64(uint64_t u)
 {
 #if defined(HAVE___BUILTIN_CLZLL)
@@ -448,7 +448,7 @@ util_last_bit64(uint64_t u)
  * significant bit is 1.
  * Return 0 if no bits are set.
  */
-static INLINE unsigned
+static inline unsigned
 util_last_bit_signed(int i)
 {
    if (i >= 0)
@@ -465,7 +465,7 @@ util_last_bit_signed(int i)
  * }
  *
  */
-static INLINE int
+static inline int
 u_bit_scan(unsigned *mask)
 {
    int i = ffs(*mask) - 1;
@@ -474,7 +474,7 @@ u_bit_scan(unsigned *mask)
 }
 
 #ifndef _MSC_VER
-static INLINE int
+static inline int
 u_bit_scan64(uint64_t *mask)
 {
    int i = ffsll(*mask) - 1;
@@ -486,7 +486,7 @@ u_bit_scan64(uint64_t *mask)
 /**
  * Return float bits.
  */
-static INLINE unsigned
+static inline unsigned
 fui( float f )
 {
    union fi fi;
@@ -494,7 +494,7 @@ fui( float f )
    return fi.ui;
 }
 
-static INLINE float
+static inline float
 uif(uint32_t ui)
 {
    union fi fi;
@@ -507,7 +507,7 @@ uif(uint32_t ui)
  * Convert ubyte to float in [0, 1].
  * XXX a 256-entry lookup table would be slightly faster.
  */
-static INLINE float
+static inline float
 ubyte_to_float(ubyte ub)
 {
    return (float) ub * (1.0f / 255.0f);
@@ -517,7 +517,7 @@ ubyte_to_float(ubyte ub)
 /**
  * Convert float in [0,1] to ubyte in [0,255] with clamping.
  */
-static INLINE ubyte
+static inline ubyte
 float_to_ubyte(float f)
 {
    union fi tmp;
@@ -535,13 +535,13 @@ float_to_ubyte(float f)
    }
 }
 
-static INLINE float
+static inline float
 byte_to_float_tex(int8_t b)
 {
    return (b == -128) ? -1.0F : b * 1.0F / 127.0F;
 }
 
-static INLINE int8_t
+static inline int8_t
 float_to_byte_tex(float f)
 {
    return (int8_t) (127.0F * f);
@@ -550,7 +550,7 @@ float_to_byte_tex(float f)
 /**
  * Calc log base 2
  */
-static INLINE unsigned
+static inline unsigned
 util_logbase2(unsigned n)
 {
 #if defined(HAVE___BUILTIN_CLZ)
@@ -570,7 +570,7 @@ util_logbase2(unsigned n)
 /**
  * Returns the smallest power of two >= x
  */
-static INLINE unsigned
+static inline unsigned
 util_next_power_of_two(unsigned x)
 {
 #if defined(HAVE___BUILTIN_CLZ)
@@ -602,7 +602,7 @@ util_next_power_of_two(unsigned x)
 /**
  * Return number of bits set in n.
  */
-static INLINE unsigned
+static inline unsigned
 util_bitcount(unsigned n)
 {
 #if defined(HAVE___BUILTIN_POPCOUNT)
@@ -623,7 +623,7 @@ util_bitcount(unsigned n)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 util_bitcount64(uint64_t n)
 {
 #ifdef HAVE___BUILTIN_POPCOUNTLL
@@ -639,7 +639,7 @@ util_bitcount64(uint64_t n)
  * Algorithm taken from:
  * http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer
  */
-static INLINE unsigned
+static inline unsigned
 util_bitreverse(unsigned n)
 {
     n = ((n >> 1) & 0x55555555u) | ((n & 0x55555555u) << 1);
@@ -671,7 +671,7 @@ util_bitreverse(unsigned n)
 /**
  * Reverse byte order of a 32 bit word.
  */
-static INLINE uint32_t
+static inline uint32_t
 util_bswap32(uint32_t n)
 {
 #if defined(HAVE___BUILTIN_BSWAP32)
@@ -687,7 +687,7 @@ util_bswap32(uint32_t n)
 /**
  * Reverse byte order of a 64bit word.
  */
-static INLINE uint64_t
+static inline uint64_t
 util_bswap64(uint64_t n)
 {
 #if defined(HAVE___BUILTIN_BSWAP64)
@@ -702,14 +702,14 @@ util_bswap64(uint64_t n)
 /**
  * Reverse byte order of a 16 bit word.
  */
-static INLINE uint16_t
+static inline uint16_t
 util_bswap16(uint16_t n)
 {
    return (n >> 8) |
           (n << 8);
 }
 
-static INLINE void*
+static inline void*
 util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n)
 {
 #ifdef PIPE_ARCH_BIG_ENDIAN
@@ -746,7 +746,7 @@ util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t
 /**
  * Align a value, only works pot alignemnts.
  */
-static INLINE int
+static inline int
 align(int value, int alignment)
 {
    return (value + alignment - 1) & ~(alignment - 1);
@@ -755,7 +755,7 @@ align(int value, int alignment)
 /**
  * Works like align but on npot alignments.
  */
-static INLINE size_t
+static inline size_t
 util_align_npot(size_t value, size_t alignment)
 {
    if (value % alignment)
@@ -763,7 +763,7 @@ util_align_npot(size_t value, size_t alignment)
    return value;
 }
 
-static INLINE unsigned
+static inline unsigned
 u_minify(unsigned value, unsigned levels)
 {
     return MAX2(1, value >> levels);
@@ -796,13 +796,13 @@ do {                                     \
 #endif
 
 
-static INLINE uint32_t
+static inline uint32_t
 util_unsigned_fixed(float value, unsigned frac_bits)
 {
    return value < 0 ? 0 : (uint32_t)(value * (1<<frac_bits));
 }
 
-static INLINE int32_t
+static inline int32_t
 util_signed_fixed(float value, unsigned frac_bits)
 {
    return (int32_t)(value * (1<<frac_bits));
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 9ff6c7da919..7fe0fe6f053 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -67,7 +67,7 @@ extern "C" {
 /**
  * Duplicate a block of memory.
  */
-static INLINE void *
+static inline void *
 mem_dup(const void *src, uint size)
 {
    void *dup = MALLOC(size);
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 82f83702d1e..2069b56f464 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -224,7 +224,7 @@ u_mmFindBlock(struct mem_block *heap, int start)
 }
 
 
-static INLINE int
+static inline int
 Join2Blocks(struct mem_block *p)
 {
    /* XXX there should be some assertions here */
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index e0c9018f8ef..b882502b7ba 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -60,7 +60,7 @@ union util_color {
 /**
  * Pack ubyte R,G,B,A into dest pixel.
  */
-static INLINE void
+static inline void
 util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
                    enum pipe_format format, union util_color *uc)
 {
@@ -161,7 +161,7 @@ util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
 /**
  * Unpack RGBA from a packed pixel, returning values as ubytes in [0,255].
  */
-static INLINE void
+static inline void
 util_unpack_color_ub(enum pipe_format format, union util_color *uc,
                      ubyte *r, ubyte *g, ubyte *b, ubyte *a)
 {
@@ -333,7 +333,7 @@ util_unpack_color_ub(enum pipe_format format, union util_color *uc,
  * This will not work (and might not really be useful with float input)
  * for pure integer formats (which lack the pack_rgba_float function).
  */
-static INLINE void
+static inline void
 util_pack_color(const float rgba[4], enum pipe_format format, union util_color *uc)
 {
    ubyte r = 0;
@@ -437,7 +437,7 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color *
 /* Integer versions of util_pack_z and util_pack_z_stencil - useful for
  * constructing clear masks.
  */
-static INLINE uint32_t
+static inline uint32_t
 util_pack_mask_z(enum pipe_format format, uint32_t z)
 {
    switch (format) {
@@ -462,7 +462,7 @@ util_pack_mask_z(enum pipe_format format, uint32_t z)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_mask_z(enum pipe_format format, uint32_t z)
 {
    switch (format) {
@@ -474,7 +474,7 @@ util_pack64_mask_z(enum pipe_format format, uint32_t z)
 }
 
 
-static INLINE uint32_t
+static inline uint32_t
 util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 {
    uint32_t packed = util_pack_mask_z(format, z);
@@ -497,7 +497,7 @@ util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 {
    uint64_t packed;
@@ -516,7 +516,7 @@ util_pack64_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 /**
  * Note: it's assumed that z is in [0,1]
  */
-static INLINE uint32_t
+static inline uint32_t
 util_pack_z(enum pipe_format format, double z)
 {
    union fi fui;
@@ -558,7 +558,7 @@ util_pack_z(enum pipe_format format, double z)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_z(enum pipe_format format, double z)
 {
    union fi fui;
@@ -580,7 +580,7 @@ util_pack64_z(enum pipe_format format, double z)
  * Pack Z and/or stencil values into a 32-bit value described by format.
  * Note: it's assumed that z is in [0,1] and s in [0,255]
  */
-static INLINE uint32_t
+static inline uint32_t
 util_pack_z_stencil(enum pipe_format format, double z, uint8_t s)
 {
    uint32_t packed = util_pack_z(format, z);
@@ -603,7 +603,7 @@ util_pack_z_stencil(enum pipe_format format, double z, uint8_t s)
 }
 
 
-static INLINE uint64_t
+static inline uint64_t
 util_pack64_z_stencil(enum pipe_format format, double z, uint8_t s)
 {
    uint64_t packed;
@@ -624,7 +624,7 @@ util_pack64_z_stencil(enum pipe_format format, double z, uint8_t s)
 /**
  * Pack 4 ubytes into a 4-byte word
  */
-static INLINE unsigned
+static inline unsigned
 pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3)
 {
    return ((((unsigned int)b0) << 0) |
@@ -637,7 +637,7 @@ pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3)
 /**
  * Pack/convert 4 floats into one 4-byte word.
  */
-static INLINE unsigned
+static inline unsigned
 pack_ui32_float4(float a, float b, float c, float d)
 {
    return pack_ub4( float_to_ubyte(a),
diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h
index 30c23b79831..4f7a27ca61d 100644
--- a/src/gallium/auxiliary/util/u_pointer.h
+++ b/src/gallium/auxiliary/util/u_pointer.h
@@ -34,7 +34,7 @@
 extern "C" {
 #endif
 
-static INLINE intptr_t
+static inline intptr_t
 pointer_to_intptr( const void *p )
 {
    union {
@@ -45,7 +45,7 @@ pointer_to_intptr( const void *p )
    return pi.i;
 }
 
-static INLINE void *
+static inline void *
 intptr_to_pointer( intptr_t i )
 {
    union {
@@ -56,7 +56,7 @@ intptr_to_pointer( intptr_t i )
    return pi.p;
 }
 
-static INLINE uintptr_t
+static inline uintptr_t
 pointer_to_uintptr( const void *ptr )
 {
    union {
@@ -67,7 +67,7 @@ pointer_to_uintptr( const void *ptr )
    return pu.u;
 }
 
-static INLINE void *
+static inline void *
 uintptr_to_pointer( uintptr_t u )
 {
    union {
@@ -81,7 +81,7 @@ uintptr_to_pointer( uintptr_t u )
 /**
  * Return a pointer aligned to next multiple of N bytes.
  */
-static INLINE void *
+static inline void *
 align_pointer( const void *unaligned, uintptr_t alignment )
 {
    uintptr_t aligned = (pointer_to_uintptr( unaligned ) + alignment - 1) & ~(alignment - 1);
@@ -92,7 +92,7 @@ align_pointer( const void *unaligned, uintptr_t alignment )
 /**
  * Return a pointer aligned to next multiple of 16 bytes.
  */
-static INLINE void *
+static inline void *
 align16( void *unaligned )
 {
    return align_pointer( unaligned, 16 );
@@ -100,7 +100,7 @@ align16( void *unaligned )
 
 typedef void (*func_pointer)(void);
 
-static INLINE func_pointer
+static inline func_pointer
 pointer_to_func( void *p )
 {
    union {
@@ -111,7 +111,7 @@ pointer_to_func( void *p )
    return pf.f;
 }
 
-static INLINE void *
+static inline void *
 func_to_pointer( func_pointer f )
 {
    union {
diff --git a/src/gallium/auxiliary/util/u_prim.h b/src/gallium/auxiliary/util/u_prim.h
index b2dd44df230..366801545ed 100644
--- a/src/gallium/auxiliary/util/u_prim.h
+++ b/src/gallium/auxiliary/util/u_prim.h
@@ -46,7 +46,7 @@ struct u_prim_vertex_count {
  * Decompose a primitive that is a loop, a strip, or a fan.  Return the
  * original primitive if it is already decomposed.
  */
-static INLINE unsigned
+static inline unsigned
 u_decomposed_prim(unsigned prim)
 {
    switch (prim) {
@@ -71,7 +71,7 @@ u_decomposed_prim(unsigned prim)
  * Reduce a primitive to one of PIPE_PRIM_POINTS, PIPE_PRIM_LINES, and
  * PIPE_PRIM_TRIANGLES.
  */
-static INLINE unsigned
+static inline unsigned
 u_reduced_prim(unsigned prim)
 {
    switch (prim) {
@@ -91,7 +91,7 @@ u_reduced_prim(unsigned prim)
 /**
  * Re-assemble a primitive to remove its adjacency.
  */
-static INLINE unsigned
+static inline unsigned
 u_assembled_prim(unsigned prim)
 {
    switch (prim) {
@@ -113,7 +113,7 @@ u_assembled_prim(unsigned prim)
  * source file, it will increase the size of the binary slightly more than
  * expected because of the use of a table.
  */
-static INLINE const struct u_prim_vertex_count *
+static inline const struct u_prim_vertex_count *
 u_prim_vertex_count(unsigned prim)
 {
    static const struct u_prim_vertex_count prim_table[PIPE_PRIM_MAX] = {
@@ -140,7 +140,7 @@ u_prim_vertex_count(unsigned prim)
  * Given a vertex count, return the number of primitives.
  * For polygons, return the number of triangles.
  */
-static INLINE unsigned
+static inline unsigned
 u_prims_for_vertices(unsigned prim, unsigned num)
 {
    const struct u_prim_vertex_count *info = u_prim_vertex_count(prim);
@@ -151,7 +151,7 @@ u_prims_for_vertices(unsigned prim, unsigned num)
    return 1 + ((num - info->min) / info->incr);
 }
 
-static INLINE boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
+static inline boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
 {
    const struct u_prim_vertex_count *count = u_prim_vertex_count(pipe_prim);
 
@@ -159,7 +159,7 @@ static INLINE boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
 }
 
 
-static INLINE boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
+static inline boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
 {
    const struct u_prim_vertex_count *count = u_prim_vertex_count(pipe_prim);
 
@@ -174,7 +174,7 @@ static INLINE boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 u_vertices_per_prim(int primitive)
 {
    switch(primitive) {
@@ -216,7 +216,7 @@ u_vertices_per_prim(int primitive)
  * statistics depend on knowing the exact number of decomposed
  * primitives for a set of vertices.
  */
-static INLINE unsigned
+static inline unsigned
 u_decomposed_prims_for_vertices(int primitive, int vertices)
 {
    switch (primitive) {
@@ -263,7 +263,7 @@ u_decomposed_prims_for_vertices(int primitive, int vertices)
  * count.  Each quad is treated as two triangles.  Polygons are treated as
  * triangle fans.
  */
-static INLINE unsigned
+static inline unsigned
 u_reduced_prims_for_vertices(int primitive, int vertices)
 {
    switch (primitive) {
diff --git a/src/gallium/auxiliary/util/u_range.h b/src/gallium/auxiliary/util/u_range.h
index efe25ef5e42..a1da5e5a6f0 100644
--- a/src/gallium/auxiliary/util/u_range.h
+++ b/src/gallium/auxiliary/util/u_range.h
@@ -47,7 +47,7 @@ struct util_range {
 };
 
 
-static INLINE void
+static inline void
 util_range_set_empty(struct util_range *range)
 {
    range->start = ~0;
@@ -55,7 +55,7 @@ util_range_set_empty(struct util_range *range)
 }
 
 /* This is like a union of two sets. */
-static INLINE void
+static inline void
 util_range_add(struct util_range *range, unsigned start, unsigned end)
 {
    if (start < range->start || end > range->end) {
@@ -66,7 +66,7 @@ util_range_add(struct util_range *range, unsigned start, unsigned end)
    }
 }
 
-static INLINE boolean
+static inline boolean
 util_ranges_intersect(struct util_range *range, unsigned start, unsigned end)
 {
    return MAX2(start, range->start) < MIN2(end, range->end);
@@ -75,14 +75,14 @@ util_ranges_intersect(struct util_range *range, unsigned start, unsigned end)
 
 /* Init/deinit */
 
-static INLINE void
+static inline void
 util_range_init(struct util_range *range)
 {
    pipe_mutex_init(range->write_mutex);
    util_range_set_empty(range);
 }
 
-static INLINE void
+static inline void
 util_range_destroy(struct util_range *range)
 {
    pipe_mutex_destroy(range->write_mutex);
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index cf29dff0d02..b26f671f313 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -43,7 +43,7 @@ struct u_rect {
 
 /* Do two rectangles intersect?
  */
-static INLINE boolean
+static inline boolean
 u_rect_test_intersection(const struct u_rect *a,
                          const struct u_rect *b)
 {
@@ -55,7 +55,7 @@ u_rect_test_intersection(const struct u_rect *a,
 
 /* Find the intersection of two rectangles known to intersect.
  */
-static INLINE void
+static inline void
 u_rect_find_intersection(const struct u_rect *a,
                          struct u_rect *b)
 {
@@ -68,13 +68,13 @@ u_rect_find_intersection(const struct u_rect *a,
 }
 
 
-static INLINE int
+static inline int
 u_rect_area(const struct u_rect *r)
 {
    return (r->x1 - r->x0) * (r->y1 - r->y0);
 }
 
-static INLINE void
+static inline void
 u_rect_possible_intersection(const struct u_rect *a,
                              struct u_rect *b)
 {
@@ -88,7 +88,7 @@ u_rect_possible_intersection(const struct u_rect *a,
 
 /* Set @d to a rectangle that covers both @a and @b.
  */
-static INLINE void
+static inline void
 u_rect_union(struct u_rect *d, const struct u_rect *a, const struct u_rect *b)
 {
    d->x0 = MIN2(a->x0, b->x0);
diff --git a/src/gallium/auxiliary/util/u_resource.h b/src/gallium/auxiliary/util/u_resource.h
index a5e091fd66e..6736476f4da 100644
--- a/src/gallium/auxiliary/util/u_resource.h
+++ b/src/gallium/auxiliary/util/u_resource.h
@@ -36,7 +36,7 @@ util_resource_size(const struct pipe_resource *res);
  *
  * Note that this function returns true for single-layered array textures.
  */
-static INLINE boolean
+static inline boolean
 util_resource_is_array_texture(const struct pipe_resource *res)
 {
    switch (res->target) {
diff --git a/src/gallium/auxiliary/util/u_ringbuffer.c b/src/gallium/auxiliary/util/u_ringbuffer.c
index 648b105b137..5816b781660 100644
--- a/src/gallium/auxiliary/util/u_ringbuffer.c
+++ b/src/gallium/auxiliary/util/u_ringbuffer.c
@@ -56,7 +56,7 @@ void util_ringbuffer_destroy( struct util_ringbuffer *ring )
 /**
  * Return number of free entries in the ring
  */
-static INLINE unsigned util_ringbuffer_space( const struct util_ringbuffer *ring )
+static inline unsigned util_ringbuffer_space( const struct util_ringbuffer *ring )
 {
    return (ring->tail - (ring->head + 1)) & ring->mask;
 }
@@ -64,7 +64,7 @@ static INLINE unsigned util_ringbuffer_space( const struct util_ringbuffer *ring
 /**
  * Is the ring buffer empty?
  */
-static INLINE boolean util_ringbuffer_empty( const struct util_ringbuffer *ring )
+static inline boolean util_ringbuffer_empty( const struct util_ringbuffer *ring )
 {
    return util_ringbuffer_space(ring) == ring->mask;
 }
diff --git a/src/gallium/auxiliary/util/u_snprintf.c b/src/gallium/auxiliary/util/u_snprintf.c
deleted file mode 100644
index 39e9b70d0f8..00000000000
--- a/src/gallium/auxiliary/util/u_snprintf.c
+++ /dev/null
@@ -1,1480 +0,0 @@
-/*
- * Copyright (c) 1995 Patrick Powell.
- *
- * This code is based on code written by Patrick Powell <papowell@astart.com>.
- * It may be used for any purpose as long as this notice remains intact on all
- * source code distributions.
- */
-
-/*
- * Copyright (c) 2008 Holger Weiss.
- *
- * This version of the code is maintained by Holger Weiss <holger@jhweiss.de>.
- * My changes to the code may freely be used, modified and/or redistributed for
- * any purpose.  It would be nice if additions and fixes to this file (including
- * trivial code cleanups) would be sent back in order to let me include them in
- * the version available at <http://www.jhweiss.de/software/snprintf.html>.
- * However, this is not a requirement for using or redistributing (possibly
- * modified) versions of this file, nor is leaving this notice intact mandatory.
- */
-
-/*
- * History
- *
- * 2008-01-20 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.1:
- *
- * 	Fixed the detection of infinite floating point values on IRIX (and
- * 	possibly other systems) and applied another few minor cleanups.
- *
- * 2008-01-06 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.0:
- *
- * 	Added a lot of new features, fixed many bugs, and incorporated various
- * 	improvements done by Andrew Tridgell <tridge@samba.org>, Russ Allbery
- * 	<rra@stanford.edu>, Hrvoje Niksic <hniksic@xemacs.org>, Damien Miller
- * 	<djm@mindrot.org>, and others for the Samba, INN, Wget, and OpenSSH
- * 	projects.  The additions include: support the "e", "E", "g", "G", and
- * 	"F" conversion specifiers (and use conversion style "f" or "F" for the
- * 	still unsupported "a" and "A" specifiers); support the "hh", "ll", "j",
- * 	"t", and "z" length modifiers; support the "#" flag and the (non-C99)
- * 	"'" flag; use localeconv(3) (if available) to get both the current
- * 	locale's decimal point character and the separator between groups of
- * 	digits; fix the handling of various corner cases of field width and
- * 	precision specifications; fix various floating point conversion bugs;
- * 	handle infinite and NaN floating point values; don't attempt to write to
- * 	the output buffer (which may be NULL) if a size of zero was specified;
- * 	check for integer overflow of the field width, precision, and return
- * 	values and during the floating point conversion; use the OUTCHAR() macro
- * 	instead of a function for better performance; provide asprintf(3) and
- * 	vasprintf(3) functions; add new test cases.  The replacement functions
- * 	have been renamed to use an "rpl_" prefix, the function calls in the
- * 	main project (and in this file) must be redefined accordingly for each
- * 	replacement function which is needed (by using Autoconf or other means).
- * 	Various other minor improvements have been applied and the coding style
- * 	was cleaned up for consistency.
- *
- * 2007-07-23 Holger Weiss <holger@jhweiss.de> for Mutt 1.5.13:
- *
- * 	C99 compliant snprintf(3) and vsnprintf(3) functions return the number
- * 	of characters that would have been written to a sufficiently sized
- * 	buffer (excluding the '\0').  The original code simply returned the
- * 	length of the resulting output string, so that's been fixed.
- *
- * 1998-03-05 Michael Elkins <me@mutt.org> for Mutt 0.90.8:
- *
- * 	The original code assumed that both snprintf(3) and vsnprintf(3) were
- * 	missing.  Some systems only have snprintf(3) but not vsnprintf(3), so
- * 	the code is now broken down under HAVE_SNPRINTF and HAVE_VSNPRINTF.
- *
- * 1998-01-27 Thomas Roessler <roessler@does-not-exist.org> for Mutt 0.89i:
- *
- * 	The PGP code was using unsigned hexadecimal formats.  Unfortunately,
- * 	unsigned formats simply didn't work.
- *
- * 1997-10-22 Brandon Long <blong@fiction.net> for Mutt 0.87.1:
- *
- * 	Ok, added some minimal floating point support, which means this probably
- * 	requires libm on most operating systems.  Don't yet support the exponent
- * 	(e,E) and sigfig (g,G).  Also, fmtint() was pretty badly broken, it just
- * 	wasn't being exercised in ways which showed it, so that's been fixed.
- * 	Also, formatted the code to Mutt conventions, and removed dead code left
- * 	over from the original.  Also, there is now a builtin-test, run with:
- * 	gcc -DTEST_SNPRINTF -o snprintf snprintf.c -lm && ./snprintf
- *
- * 2996-09-15 Brandon Long <blong@fiction.net> for Mutt 0.43:
- *
- * 	This was ugly.  It is still ugly.  I opted out of floating point
- * 	numbers, but the formatter understands just about everything from the
- * 	normal C string format, at least as far as I can tell from the Solaris
- * 	2.5 printf(3S) man page.
- */
-
-/*
- * ToDo
- *
- * - Add wide character support.
- * - Add support for "%a" and "%A" conversions.
- * - Create test routines which predefine the expected results.  Our test cases
- *   usually expose bugs in system implementations rather than in ours :-)
- */
-
-/*
- * Usage
- *
- * 1) The following preprocessor macros should be defined to 1 if the feature or
- *    file in question is available on the target system (by using Autoconf or
- *    other means), though basic functionality should be available as long as
- *    HAVE_STDARG_H and HAVE_STDLIB_H are defined correctly:
- *
- *    	HAVE_VSNPRINTF
- *    	HAVE_SNPRINTF
- *    	HAVE_VASPRINTF
- *    	HAVE_ASPRINTF
- *    	HAVE_STDARG_H
- *    	HAVE_STDDEF_H
- *    	HAVE_STDINT_H
- *    	HAVE_STDLIB_H
- *    	HAVE_INTTYPES_H
- *    	HAVE_LOCALE_H
- *    	HAVE_LOCALECONV
- *    	HAVE_LCONV_DECIMAL_POINT
- *    	HAVE_LCONV_THOUSANDS_SEP
- *    	HAVE_LONG_DOUBLE
- *    	HAVE_LONG_LONG_INT
- *    	HAVE_UNSIGNED_LONG_LONG_INT
- *    	HAVE_INTMAX_T
- *    	HAVE_UINTMAX_T
- *    	HAVE_UINTPTR_T
- *    	HAVE_PTRDIFF_T
- *    	HAVE_VA_COPY
- *    	HAVE___VA_COPY
- *
- * 2) The calls to the functions which should be replaced must be redefined
- *    throughout the project files (by using Autoconf or other means):
- *
- *    	#define vsnprintf rpl_vsnprintf
- *    	#define snprintf rpl_snprintf
- *    	#define vasprintf rpl_vasprintf
- *    	#define asprintf rpl_asprintf
- *
- * 3) The required replacement functions should be declared in some header file
- *    included throughout the project files:
- *
- *    	#if HAVE_CONFIG_H
- *    	#include <config.h>
- *    	#endif
- *    	#if HAVE_STDARG_H
- *    	#include <stdarg.h>
- *    	#if !HAVE_VSNPRINTF
- *    	int rpl_vsnprintf(char *, size_t, const char *, va_list);
- *    	#endif
- *    	#if !HAVE_SNPRINTF
- *    	int rpl_snprintf(char *, size_t, const char *, ...);
- *    	#endif
- *    	#if !HAVE_VASPRINTF
- *    	int rpl_vasprintf(char **, const char *, va_list);
- *    	#endif
- *    	#if !HAVE_ASPRINTF
- *    	int rpl_asprintf(char **, const char *, ...);
- *    	#endif
- *    	#endif
- *
- * Autoconf macros for handling step 1 and step 2 are available at
- * <http://www.jhweiss.de/software/snprintf.html>.
- */
-
-#include "pipe/p_config.h"
-
-#if HAVE_CONFIG_H
-#include <config.h>
-#else
-#ifdef _MSC_VER
-#define vsnprintf util_vsnprintf
-#define snprintf util_snprintf
-#define HAVE_VSNPRINTF 0
-#define HAVE_SNPRINTF 0
-#define HAVE_VASPRINTF 1 /* not needed */
-#define HAVE_ASPRINTF 1 /* not needed */
-#define HAVE_STDARG_H 1
-#define HAVE_STDDEF_H 1
-#define HAVE_STDINT_H 1
-#define HAVE_STDLIB_H 1
-#define HAVE_INTTYPES_H 0
-#define HAVE_LOCALE_H 0
-#define HAVE_LOCALECONV 0
-#define HAVE_LCONV_DECIMAL_POINT 0
-#define HAVE_LCONV_THOUSANDS_SEP 0
-#define HAVE_LONG_DOUBLE 0
-#define HAVE_LONG_LONG_INT 1
-#define HAVE_UNSIGNED_LONG_LONG_INT 1
-#define HAVE_INTMAX_T 0
-#define HAVE_UINTMAX_T 0
-#define HAVE_UINTPTR_T 1
-#define HAVE_PTRDIFF_T 1
-#define HAVE_VA_COPY 0
-#define HAVE___VA_COPY 0
-#else
-#define HAVE_VSNPRINTF 1
-#define HAVE_SNPRINTF 1
-#define HAVE_VASPRINTF 1
-#define HAVE_ASPRINTF 1
-#endif
-#endif	/* HAVE_CONFIG_H */
-
-#if !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || !HAVE_VASPRINTF
-#include <stdio.h>	/* For NULL, size_t, vsnprintf(3), and vasprintf(3). */
-#ifdef VA_START
-#undef VA_START
-#endif	/* defined(VA_START) */
-#ifdef VA_SHIFT
-#undef VA_SHIFT
-#endif	/* defined(VA_SHIFT) */
-#if HAVE_STDARG_H
-#include <stdarg.h>
-#define VA_START(ap, last) va_start(ap, last)
-#define VA_SHIFT(ap, value, type) /* No-op for ANSI C. */
-#else	/* Assume <varargs.h> is available. */
-#include <varargs.h>
-#define VA_START(ap, last) va_start(ap)	/* "last" is ignored. */
-#define VA_SHIFT(ap, value, type) value = va_arg(ap, type)
-#endif	/* HAVE_STDARG_H */
-
-#if !HAVE_VASPRINTF
-#if HAVE_STDLIB_H
-#include <stdlib.h>	/* For malloc(3). */
-#endif	/* HAVE_STDLIB_H */
-#ifdef VA_COPY
-#undef VA_COPY
-#endif	/* defined(VA_COPY) */
-#ifdef VA_END_COPY
-#undef VA_END_COPY
-#endif	/* defined(VA_END_COPY) */
-#if HAVE_VA_COPY
-#define VA_COPY(dest, src) va_copy(dest, src)
-#define VA_END_COPY(ap) va_end(ap)
-#elif HAVE___VA_COPY
-#define VA_COPY(dest, src) __va_copy(dest, src)
-#define VA_END_COPY(ap) va_end(ap)
-#else
-#define VA_COPY(dest, src) (void)mymemcpy(&dest, &src, sizeof(va_list))
-#define VA_END_COPY(ap) /* No-op. */
-#define NEED_MYMEMCPY 1
-static void *mymemcpy(void *, void *, size_t);
-#endif	/* HAVE_VA_COPY */
-#endif	/* !HAVE_VASPRINTF */
-
-#if !HAVE_VSNPRINTF
-#include <limits.h>	/* For *_MAX. */
-#if HAVE_INTTYPES_H
-#include <inttypes.h>	/* For intmax_t (if not defined in <stdint.h>). */
-#endif	/* HAVE_INTTYPES_H */
-#if HAVE_LOCALE_H
-#include <locale.h>	/* For localeconv(3). */
-#endif	/* HAVE_LOCALE_H */
-#if HAVE_STDDEF_H
-#include <stddef.h>	/* For ptrdiff_t. */
-#endif	/* HAVE_STDDEF_H */
-#if HAVE_STDINT_H
-#include <stdint.h>	/* For intmax_t. */
-#endif	/* HAVE_STDINT_H */
-
-/* Support for unsigned long long int.  We may also need ULLONG_MAX. */
-#ifndef ULONG_MAX	/* We may need ULONG_MAX as a fallback. */
-#ifdef UINT_MAX
-#define ULONG_MAX UINT_MAX
-#else
-#define ULONG_MAX INT_MAX
-#endif	/* defined(UINT_MAX) */
-#endif	/* !defined(ULONG_MAX) */
-#ifdef ULLONG
-#undef ULLONG
-#endif	/* defined(ULLONG) */
-#if HAVE_UNSIGNED_LONG_LONG_INT
-#define ULLONG unsigned long long int
-#ifndef ULLONG_MAX
-#define ULLONG_MAX ULONG_MAX
-#endif	/* !defined(ULLONG_MAX) */
-#else
-#define ULLONG unsigned long int
-#ifdef ULLONG_MAX
-#undef ULLONG_MAX
-#endif	/* defined(ULLONG_MAX) */
-#define ULLONG_MAX ULONG_MAX
-#endif	/* HAVE_LONG_LONG_INT */
-
-/* Support for uintmax_t.  We also need UINTMAX_MAX. */
-#ifdef UINTMAX_T
-#undef UINTMAX_T
-#endif	/* defined(UINTMAX_T) */
-#if HAVE_UINTMAX_T || defined(uintmax_t)
-#define UINTMAX_T uintmax_t
-#ifndef UINTMAX_MAX
-#define UINTMAX_MAX ULLONG_MAX
-#endif	/* !defined(UINTMAX_MAX) */
-#else
-#define UINTMAX_T ULLONG
-#ifdef UINTMAX_MAX
-#undef UINTMAX_MAX
-#endif	/* defined(UINTMAX_MAX) */
-#define UINTMAX_MAX ULLONG_MAX
-#endif	/* HAVE_UINTMAX_T || defined(uintmax_t) */
-
-/* Support for long double. */
-#ifndef LDOUBLE
-#if HAVE_LONG_DOUBLE
-#define LDOUBLE long double
-#else
-#define LDOUBLE double
-#endif	/* HAVE_LONG_DOUBLE */
-#endif	/* !defined(LDOUBLE) */
-
-/* Support for long long int. */
-#ifndef LLONG
-#if HAVE_LONG_LONG_INT
-#define LLONG long long int
-#else
-#define LLONG long int
-#endif	/* HAVE_LONG_LONG_INT */
-#endif	/* !defined(LLONG) */
-
-/* Support for intmax_t. */
-#ifndef INTMAX_T
-#if HAVE_INTMAX_T || defined(intmax_t)
-#define INTMAX_T intmax_t
-#else
-#define INTMAX_T LLONG
-#endif	/* HAVE_INTMAX_T || defined(intmax_t) */
-#endif	/* !defined(INTMAX_T) */
-
-/* Support for uintptr_t. */
-#ifndef UINTPTR_T
-#if HAVE_UINTPTR_T || defined(uintptr_t)
-#define UINTPTR_T uintptr_t
-#else
-#define UINTPTR_T unsigned long int
-#endif	/* HAVE_UINTPTR_T || defined(uintptr_t) */
-#endif	/* !defined(UINTPTR_T) */
-
-/* Support for ptrdiff_t. */
-#ifndef PTRDIFF_T
-#if HAVE_PTRDIFF_T || defined(ptrdiff_t)
-#define PTRDIFF_T ptrdiff_t
-#else
-#define PTRDIFF_T long int
-#endif	/* HAVE_PTRDIFF_T || defined(ptrdiff_t) */
-#endif	/* !defined(PTRDIFF_T) */
-
-/*
- * We need an unsigned integer type corresponding to ptrdiff_t (cf. C99:
- * 7.19.6.1, 7).  However, we'll simply use PTRDIFF_T and convert it to an
- * unsigned type if necessary.  This should work just fine in practice.
- */
-#ifndef UPTRDIFF_T
-#define UPTRDIFF_T PTRDIFF_T
-#endif	/* !defined(UPTRDIFF_T) */
-
-/*
- * We need a signed integer type corresponding to size_t (cf. C99: 7.19.6.1, 7).
- * However, we'll simply use size_t and convert it to a signed type if
- * necessary.  This should work just fine in practice.
- */
-#ifndef SSIZE_T
-#define SSIZE_T size_t
-#endif	/* !defined(SSIZE_T) */
-
-/* Either ERANGE or E2BIG should be available everywhere. */
-#ifndef ERANGE
-#define ERANGE E2BIG
-#endif	/* !defined(ERANGE) */
-#ifndef EOVERFLOW
-#define EOVERFLOW ERANGE
-#endif	/* !defined(EOVERFLOW) */
-
-/*
- * Buffer size to hold the octal string representation of UINT128_MAX without
- * nul-termination ("3777777777777777777777777777777777777777777").
- */
-#ifdef MAX_CONVERT_LENGTH
-#undef MAX_CONVERT_LENGTH
-#endif	/* defined(MAX_CONVERT_LENGTH) */
-#define MAX_CONVERT_LENGTH      43
-
-/* Format read states. */
-#define PRINT_S_DEFAULT         0
-#define PRINT_S_FLAGS           1
-#define PRINT_S_WIDTH           2
-#define PRINT_S_DOT             3
-#define PRINT_S_PRECISION       4
-#define PRINT_S_MOD             5
-#define PRINT_S_CONV            6
-
-/* Format flags. */
-#define PRINT_F_MINUS           (1 << 0)
-#define PRINT_F_PLUS            (1 << 1)
-#define PRINT_F_SPACE           (1 << 2)
-#define PRINT_F_NUM             (1 << 3)
-#define PRINT_F_ZERO            (1 << 4)
-#define PRINT_F_QUOTE           (1 << 5)
-#define PRINT_F_UP              (1 << 6)
-#define PRINT_F_UNSIGNED        (1 << 7)
-#define PRINT_F_TYPE_G          (1 << 8)
-#define PRINT_F_TYPE_E          (1 << 9)
-
-/* Conversion flags. */
-#define PRINT_C_CHAR            1
-#define PRINT_C_SHORT           2
-#define PRINT_C_LONG            3
-#define PRINT_C_LLONG           4
-#define PRINT_C_LDOUBLE         5
-#define PRINT_C_SIZE            6
-#define PRINT_C_PTRDIFF         7
-#define PRINT_C_INTMAX          8
-
-#ifndef MAX
-#define MAX(x, y) ((x >= y) ? x : y)
-#endif	/* !defined(MAX) */
-#ifndef CHARTOINT
-#define CHARTOINT(ch) (ch - '0')
-#endif	/* !defined(CHARTOINT) */
-#ifndef ISDIGIT
-#define ISDIGIT(ch) ('0' <= (unsigned char)ch && (unsigned char)ch <= '9')
-#endif	/* !defined(ISDIGIT) */
-#ifndef ISNAN
-#define ISNAN(x) (x != x)
-#endif	/* !defined(ISNAN) */
-#ifndef ISINF
-#define ISINF(x) (x != 0.0 && x + x == x)
-#endif	/* !defined(ISINF) */
-
-#ifdef OUTCHAR
-#undef OUTCHAR
-#endif	/* defined(OUTCHAR) */
-#define OUTCHAR(str, len, size, ch)                                          \
-do {                                                                         \
-	if (len + 1 < size)                                                  \
-		str[len] = ch;                                               \
-	(len)++;                                                             \
-} while (/* CONSTCOND */ 0)
-
-static void fmtstr(char *, size_t *, size_t, const char *, int, int, int);
-static void fmtint(char *, size_t *, size_t, INTMAX_T, int, int, int, int);
-static void fmtflt(char *, size_t *, size_t, LDOUBLE, int, int, int, int *);
-static void printsep(char *, size_t *, size_t);
-static int getnumsep(int);
-static int getexponent(LDOUBLE);
-static int convert(UINTMAX_T, char *, size_t, int, int);
-static UINTMAX_T cast(LDOUBLE);
-static UINTMAX_T myround(LDOUBLE);
-static LDOUBLE mypow10(int);
-
-int
-util_vsnprintf(char *str, size_t size, const char *format, va_list args)
-{
-	LDOUBLE fvalue;
-	INTMAX_T value;
-	unsigned char cvalue;
-	const char *strvalue;
-	INTMAX_T *intmaxptr;
-	PTRDIFF_T *ptrdiffptr;
-	SSIZE_T *sizeptr;
-	LLONG *llongptr;
-	long int *longptr;
-	int *intptr;
-	short int *shortptr;
-	signed char *charptr;
-	size_t len = 0;
-	int overflow = 0;
-	int base = 0;
-	int cflags = 0;
-	int flags = 0;
-	int width = 0;
-	int precision = -1;
-	int state = PRINT_S_DEFAULT;
-	char ch = *format++;
-
-	/*
-	 * C99 says: "If `n' is zero, nothing is written, and `s' may be a null
-	 * pointer." (7.19.6.5, 2)  We're forgiving and allow a NULL pointer
-	 * even if a size larger than zero was specified.  At least NetBSD's
-	 * snprintf(3) does the same, as well as other versions of this file.
-	 * (Though some of these versions will write to a non-NULL buffer even
-	 * if a size of zero was specified, which violates the standard.)
-	 */
-	if (str == NULL && size != 0)
-		size = 0;
-
-	while (ch != '\0')
-		switch (state) {
-		case PRINT_S_DEFAULT:
-			if (ch == '%')
-				state = PRINT_S_FLAGS;
-			else
-				OUTCHAR(str, len, size, ch);
-			ch = *format++;
-			break;
-		case PRINT_S_FLAGS:
-			switch (ch) {
-			case '-':
-				flags |= PRINT_F_MINUS;
-				ch = *format++;
-				break;
-			case '+':
-				flags |= PRINT_F_PLUS;
-				ch = *format++;
-				break;
-			case ' ':
-				flags |= PRINT_F_SPACE;
-				ch = *format++;
-				break;
-			case '#':
-				flags |= PRINT_F_NUM;
-				ch = *format++;
-				break;
-			case '0':
-				flags |= PRINT_F_ZERO;
-				ch = *format++;
-				break;
-			case '\'':	/* SUSv2 flag (not in C99). */
-				flags |= PRINT_F_QUOTE;
-				ch = *format++;
-				break;
-			default:
-				state = PRINT_S_WIDTH;
-				break;
-			}
-			break;
-		case PRINT_S_WIDTH:
-			if (ISDIGIT(ch)) {
-				ch = CHARTOINT(ch);
-				if (width > (INT_MAX - ch) / 10) {
-					overflow = 1;
-					goto out;
-				}
-				width = 10 * width + ch;
-				ch = *format++;
-			} else if (ch == '*') {
-				/*
-				 * C99 says: "A negative field width argument is
-				 * taken as a `-' flag followed by a positive
-				 * field width." (7.19.6.1, 5)
-				 */
-				if ((width = va_arg(args, int)) < 0) {
-					flags |= PRINT_F_MINUS;
-					width = -width;
-				}
-				ch = *format++;
-				state = PRINT_S_DOT;
-			} else
-				state = PRINT_S_DOT;
-			break;
-		case PRINT_S_DOT:
-			if (ch == '.') {
-				state = PRINT_S_PRECISION;
-				ch = *format++;
-			} else
-				state = PRINT_S_MOD;
-			break;
-		case PRINT_S_PRECISION:
-			if (precision == -1)
-				precision = 0;
-			if (ISDIGIT(ch)) {
-				ch = CHARTOINT(ch);
-				if (precision > (INT_MAX - ch) / 10) {
-					overflow = 1;
-					goto out;
-				}
-				precision = 10 * precision + ch;
-				ch = *format++;
-			} else if (ch == '*') {
-				/*
-				 * C99 says: "A negative precision argument is
-				 * taken as if the precision were omitted."
-				 * (7.19.6.1, 5)
-				 */
-				if ((precision = va_arg(args, int)) < 0)
-					precision = -1;
-				ch = *format++;
-				state = PRINT_S_MOD;
-			} else
-				state = PRINT_S_MOD;
-			break;
-		case PRINT_S_MOD:
-			switch (ch) {
-			case 'h':
-				ch = *format++;
-				if (ch == 'h') {	/* It's a char. */
-					ch = *format++;
-					cflags = PRINT_C_CHAR;
-				} else
-					cflags = PRINT_C_SHORT;
-				break;
-			case 'l':
-				ch = *format++;
-				if (ch == 'l') {	/* It's a long long. */
-					ch = *format++;
-					cflags = PRINT_C_LLONG;
-				} else
-					cflags = PRINT_C_LONG;
-				break;
-			case 'L':
-				cflags = PRINT_C_LDOUBLE;
-				ch = *format++;
-				break;
-			case 'j':
-				cflags = PRINT_C_INTMAX;
-				ch = *format++;
-				break;
-			case 't':
-				cflags = PRINT_C_PTRDIFF;
-				ch = *format++;
-				break;
-			case 'z':
-				cflags = PRINT_C_SIZE;
-				ch = *format++;
-				break;
-			}
-			state = PRINT_S_CONV;
-			break;
-		case PRINT_S_CONV:
-			switch (ch) {
-			case 'd':
-				/* FALLTHROUGH */
-			case 'i':
-				switch (cflags) {
-				case PRINT_C_CHAR:
-					value = (signed char)va_arg(args, int);
-					break;
-				case PRINT_C_SHORT:
-					value = (short int)va_arg(args, int);
-					break;
-				case PRINT_C_LONG:
-					value = va_arg(args, long int);
-					break;
-				case PRINT_C_LLONG:
-					value = va_arg(args, LLONG);
-					break;
-				case PRINT_C_SIZE:
-					value = va_arg(args, SSIZE_T);
-					break;
-				case PRINT_C_INTMAX:
-					value = va_arg(args, INTMAX_T);
-					break;
-				case PRINT_C_PTRDIFF:
-					value = va_arg(args, PTRDIFF_T);
-					break;
-				default:
-					value = va_arg(args, int);
-					break;
-				}
-				fmtint(str, &len, size, value, 10, width,
-				    precision, flags);
-				break;
-			case 'X':
-				flags |= PRINT_F_UP;
-				/* FALLTHROUGH */
-			case 'x':
-				base = 16;
-				/* FALLTHROUGH */
-			case 'o':
-				if (base == 0)
-					base = 8;
-				/* FALLTHROUGH */
-			case 'u':
-				if (base == 0)
-					base = 10;
-				flags |= PRINT_F_UNSIGNED;
-				switch (cflags) {
-				case PRINT_C_CHAR:
-					value = (unsigned char)va_arg(args,
-					    unsigned int);
-					break;
-				case PRINT_C_SHORT:
-					value = (unsigned short int)va_arg(args,
-					    unsigned int);
-					break;
-				case PRINT_C_LONG:
-					value = va_arg(args, unsigned long int);
-					break;
-				case PRINT_C_LLONG:
-					value = va_arg(args, ULLONG);
-					break;
-				case PRINT_C_SIZE:
-					value = va_arg(args, size_t);
-					break;
-				case PRINT_C_INTMAX:
-					value = va_arg(args, UINTMAX_T);
-					break;
-				case PRINT_C_PTRDIFF:
-					value = va_arg(args, UPTRDIFF_T);
-					break;
-				default:
-					value = va_arg(args, unsigned int);
-					break;
-				}
-				fmtint(str, &len, size, value, base, width,
-				    precision, flags);
-				break;
-			case 'A':
-				/* Not yet supported, we'll use "%F". */
-				/* FALLTHROUGH */
-			case 'F':
-				flags |= PRINT_F_UP;
-			case 'a':
-				/* Not yet supported, we'll use "%f". */
-				/* FALLTHROUGH */
-			case 'f':
-				if (cflags == PRINT_C_LDOUBLE)
-					fvalue = va_arg(args, LDOUBLE);
-				else
-					fvalue = va_arg(args, double);
-				fmtflt(str, &len, size, fvalue, width,
-				    precision, flags, &overflow);
-				if (overflow)
-					goto out;
-				break;
-			case 'E':
-				flags |= PRINT_F_UP;
-				/* FALLTHROUGH */
-			case 'e':
-				flags |= PRINT_F_TYPE_E;
-				if (cflags == PRINT_C_LDOUBLE)
-					fvalue = va_arg(args, LDOUBLE);
-				else
-					fvalue = va_arg(args, double);
-				fmtflt(str, &len, size, fvalue, width,
-				    precision, flags, &overflow);
-				if (overflow)
-					goto out;
-				break;
-			case 'G':
-				flags |= PRINT_F_UP;
-				/* FALLTHROUGH */
-			case 'g':
-				flags |= PRINT_F_TYPE_G;
-				if (cflags == PRINT_C_LDOUBLE)
-					fvalue = va_arg(args, LDOUBLE);
-				else
-					fvalue = va_arg(args, double);
-				/*
-				 * If the precision is zero, it is treated as
-				 * one (cf. C99: 7.19.6.1, 8).
-				 */
-				if (precision == 0)
-					precision = 1;
-				fmtflt(str, &len, size, fvalue, width,
-				    precision, flags, &overflow);
-				if (overflow)
-					goto out;
-				break;
-			case 'c':
-				cvalue = (unsigned char)va_arg(args, int);
-				OUTCHAR(str, len, size, cvalue);
-				break;
-			case 's':
-				strvalue = va_arg(args, char *);
-				fmtstr(str, &len, size, strvalue, width,
-				    precision, flags);
-				break;
-			case 'p':
-				/*
-				 * C99 says: "The value of the pointer is
-				 * converted to a sequence of printing
-				 * characters, in an implementation-defined
-				 * manner." (C99: 7.19.6.1, 8)
-				 */
-				if ((strvalue = va_arg(args, void *)) == NULL)
-					/*
-					 * We use the glibc format.  BSD prints
-					 * "0x0", SysV "0".
-					 */
-					fmtstr(str, &len, size, "(nil)", width,
-					    -1, flags);
-				else {
-					/*
-					 * We use the BSD/glibc format.  SysV
-					 * omits the "0x" prefix (which we emit
-					 * using the PRINT_F_NUM flag).
-					 */
-					flags |= PRINT_F_NUM;
-					flags |= PRINT_F_UNSIGNED;
-					fmtint(str, &len, size,
-					    (UINTPTR_T)strvalue, 16, width,
-					    precision, flags);
-				}
-				break;
-			case 'n':
-				switch (cflags) {
-				case PRINT_C_CHAR:
-					charptr = va_arg(args, signed char *);
-					*charptr = (signed char)len;
-					break;
-				case PRINT_C_SHORT:
-					shortptr = va_arg(args, short int *);
-					*shortptr = (short int)len;
-					break;
-				case PRINT_C_LONG:
-					longptr = va_arg(args, long int *);
-					*longptr = (long int)len;
-					break;
-				case PRINT_C_LLONG:
-					llongptr = va_arg(args, LLONG *);
-					*llongptr = (LLONG)len;
-					break;
-				case PRINT_C_SIZE:
-					/*
-					 * C99 says that with the "z" length
-					 * modifier, "a following `n' conversion
-					 * specifier applies to a pointer to a
-					 * signed integer type corresponding to
-					 * size_t argument." (7.19.6.1, 7)
-					 */
-					sizeptr = va_arg(args, SSIZE_T *);
-					*sizeptr = len;
-					break;
-				case PRINT_C_INTMAX:
-					intmaxptr = va_arg(args, INTMAX_T *);
-					*intmaxptr = len;
-					break;
-				case PRINT_C_PTRDIFF:
-					ptrdiffptr = va_arg(args, PTRDIFF_T *);
-					*ptrdiffptr = len;
-					break;
-				default:
-					intptr = va_arg(args, int *);
-					*intptr = (int)len;
-					break;
-				}
-				break;
-			case '%':	/* Print a "%" character verbatim. */
-				OUTCHAR(str, len, size, ch);
-				break;
-			default:	/* Skip other characters. */
-				break;
-			}
-			ch = *format++;
-			state = PRINT_S_DEFAULT;
-			base = cflags = flags = width = 0;
-			precision = -1;
-			break;
-		}
-out:
-	if (len < size)
-		str[len] = '\0';
-	else if (size > 0)
-		str[size - 1] = '\0';
-
-	if (overflow || len >= INT_MAX) {
-		return -1;
-	}
-	return (int)len;
-}
-
-static void
-fmtstr(char *str, size_t *len, size_t size, const char *value, int width,
-       int precision, int flags)
-{
-	int padlen, strln;	/* Amount to pad. */
-	int noprecision = (precision == -1);
-
-	if (value == NULL)	/* We're forgiving. */
-		value = "(null)";
-
-	/* If a precision was specified, don't read the string past it. */
-	for (strln = 0; value[strln] != '\0' &&
-	    (noprecision || strln < precision); strln++)
-		continue;
-
-	if ((padlen = width - strln) < 0)
-		padlen = 0;
-	if (flags & PRINT_F_MINUS)	/* Left justify. */
-		padlen = -padlen;
-
-	while (padlen > 0) {	/* Leading spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen--;
-	}
-	while (*value != '\0' && (noprecision || precision-- > 0)) {
-		OUTCHAR(str, *len, size, *value);
-		value++;
-	}
-	while (padlen < 0) {	/* Trailing spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen++;
-	}
-}
-
-static void
-fmtint(char *str, size_t *len, size_t size, INTMAX_T value, int base, int width,
-       int precision, int flags)
-{
-	UINTMAX_T uvalue;
-	char iconvert[MAX_CONVERT_LENGTH];
-	char sign = 0;
-	char hexprefix = 0;
-	int spadlen = 0;	/* Amount to space pad. */
-	int zpadlen = 0;	/* Amount to zero pad. */
-	int pos;
-	int separators = (flags & PRINT_F_QUOTE);
-	int noprecision = (precision == -1);
-
-	if (flags & PRINT_F_UNSIGNED)
-		uvalue = value;
-	else {
-		uvalue = (value >= 0) ? value : -value;
-		if (value < 0)
-			sign = '-';
-		else if (flags & PRINT_F_PLUS)	/* Do a sign. */
-			sign = '+';
-		else if (flags & PRINT_F_SPACE)
-			sign = ' ';
-	}
-
-	pos = convert(uvalue, iconvert, sizeof(iconvert), base,
-	    flags & PRINT_F_UP);
-
-	if (flags & PRINT_F_NUM && uvalue != 0) {
-		/*
-		 * C99 says: "The result is converted to an `alternative form'.
-		 * For `o' conversion, it increases the precision, if and only
-		 * if necessary, to force the first digit of the result to be a
-		 * zero (if the value and precision are both 0, a single 0 is
-		 * printed).  For `x' (or `X') conversion, a nonzero result has
-		 * `0x' (or `0X') prefixed to it." (7.19.6.1, 6)
-		 */
-		switch (base) {
-		case 8:
-			if (precision <= pos)
-				precision = pos + 1;
-			break;
-		case 16:
-			hexprefix = (flags & PRINT_F_UP) ? 'X' : 'x';
-			break;
-		}
-	}
-
-	if (separators)	/* Get the number of group separators we'll print. */
-		separators = getnumsep(pos);
-
-	zpadlen = precision - pos - separators;
-	spadlen = width                         /* Minimum field width. */
-	    - separators                        /* Number of separators. */
-	    - MAX(precision, pos)               /* Number of integer digits. */
-	    - ((sign != 0) ? 1 : 0)             /* Will we print a sign? */
-	    - ((hexprefix != 0) ? 2 : 0);       /* Will we print a prefix? */
-
-	if (zpadlen < 0)
-		zpadlen = 0;
-	if (spadlen < 0)
-		spadlen = 0;
-
-	/*
-	 * C99 says: "If the `0' and `-' flags both appear, the `0' flag is
-	 * ignored.  For `d', `i', `o', `u', `x', and `X' conversions, if a
-	 * precision is specified, the `0' flag is ignored." (7.19.6.1, 6)
-	 */
-	if (flags & PRINT_F_MINUS)	/* Left justify. */
-		spadlen = -spadlen;
-	else if (flags & PRINT_F_ZERO && noprecision) {
-		zpadlen += spadlen;
-		spadlen = 0;
-	}
-	while (spadlen > 0) {	/* Leading spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		spadlen--;
-	}
-	if (sign != 0)	/* Sign. */
-		OUTCHAR(str, *len, size, sign);
-	if (hexprefix != 0) {	/* A "0x" or "0X" prefix. */
-		OUTCHAR(str, *len, size, '0');
-		OUTCHAR(str, *len, size, hexprefix);
-	}
-	while (zpadlen > 0) {	/* Leading zeros. */
-		OUTCHAR(str, *len, size, '0');
-		zpadlen--;
-	}
-	while (pos > 0) {	/* The actual digits. */
-		pos--;
-		OUTCHAR(str, *len, size, iconvert[pos]);
-		if (separators > 0 && pos > 0 && pos % 3 == 0)
-			printsep(str, len, size);
-	}
-	while (spadlen < 0) {	/* Trailing spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		spadlen++;
-	}
-}
-
-static void
-fmtflt(char *str, size_t *len, size_t size, LDOUBLE fvalue, int width,
-       int precision, int flags, int *overflow)
-{
-	LDOUBLE ufvalue;
-	UINTMAX_T intpart;
-	UINTMAX_T fracpart;
-	UINTMAX_T mask;
-	const char *infnan = NULL;
-	char iconvert[MAX_CONVERT_LENGTH];
-	char fconvert[MAX_CONVERT_LENGTH];
-	char econvert[4];	/* "e-12" (without nul-termination). */
-	char esign = 0;
-	char sign = 0;
-	int leadfraczeros = 0;
-	int exponent = 0;
-	int emitpoint = 0;
-	int omitzeros = 0;
-	int omitcount = 0;
-	int padlen = 0;
-	int epos = 0;
-	int fpos = 0;
-	int ipos = 0;
-	int separators = (flags & PRINT_F_QUOTE);
-	int estyle = (flags & PRINT_F_TYPE_E);
-#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT
-	struct lconv *lc = localeconv();
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */
-
-	/*
-	 * AIX' man page says the default is 0, but C99 and at least Solaris'
-	 * and NetBSD's man pages say the default is 6, and sprintf(3) on AIX
-	 * defaults to 6.
-	 */
-	if (precision == -1)
-		precision = 6;
-
-	if (fvalue < 0.0)
-		sign = '-';
-	else if (flags & PRINT_F_PLUS)	/* Do a sign. */
-		sign = '+';
-	else if (flags & PRINT_F_SPACE)
-		sign = ' ';
-
-	if (ISNAN(fvalue))
-		infnan = (flags & PRINT_F_UP) ? "NAN" : "nan";
-	else if (ISINF(fvalue))
-		infnan = (flags & PRINT_F_UP) ? "INF" : "inf";
-
-	if (infnan != NULL) {
-		if (sign != 0)
-			iconvert[ipos++] = sign;
-		while (*infnan != '\0')
-			iconvert[ipos++] = *infnan++;
-		fmtstr(str, len, size, iconvert, width, ipos, flags);
-		return;
-	}
-
-	/* "%e" (or "%E") or "%g" (or "%G") conversion. */
-	if (flags & PRINT_F_TYPE_E || flags & PRINT_F_TYPE_G) {
-		if (flags & PRINT_F_TYPE_G) {
-			/*
-			 * For "%g" (and "%G") conversions, the precision
-			 * specifies the number of significant digits, which
-			 * includes the digits in the integer part.  The
-			 * conversion will or will not be using "e-style" (like
-			 * "%e" or "%E" conversions) depending on the precision
-			 * and on the exponent.  However, the exponent can be
-			 * affected by rounding the converted value, so we'll
-			 * leave this decision for later.  Until then, we'll
-			 * assume that we're going to do an "e-style" conversion
-			 * (in order to get the exponent calculated).  For
-			 * "e-style", the precision must be decremented by one.
-			 */
-			precision--;
-			/*
-			 * For "%g" (and "%G") conversions, trailing zeros are
-			 * removed from the fractional portion of the result
-			 * unless the "#" flag was specified.
-			 */
-			if (!(flags & PRINT_F_NUM))
-				omitzeros = 1;
-		}
-		exponent = getexponent(fvalue);
-		estyle = 1;
-	}
-
-again:
-	/*
-	 * Sorry, we only support 9, 19, or 38 digits (that is, the number of
-	 * digits of the 32-bit, the 64-bit, or the 128-bit UINTMAX_MAX value
-	 * minus one) past the decimal point due to our conversion method.
-	 */
-	switch (sizeof(UINTMAX_T)) {
-	case 16:
-		if (precision > 38)
-			precision = 38;
-		break;
-	case 8:
-		if (precision > 19)
-			precision = 19;
-		break;
-	default:
-		if (precision > 9)
-			precision = 9;
-		break;
-	}
-
-	ufvalue = (fvalue >= 0.0) ? fvalue : -fvalue;
-	if (estyle)	/* We want exactly one integer digit. */
-		ufvalue /= mypow10(exponent);
-
-	if ((intpart = cast(ufvalue)) == UINTMAX_MAX) {
-		*overflow = 1;
-		return;
-	}
-
-	/*
-	 * Factor of ten with the number of digits needed for the fractional
-	 * part.  For example, if the precision is 3, the mask will be 1000.
-	 */
-	mask = (UINTMAX_T)mypow10(precision);
-	/*
-	 * We "cheat" by converting the fractional part to integer by
-	 * multiplying by a factor of ten.
-	 */
-	if ((fracpart = myround(mask * (ufvalue - intpart))) >= mask) {
-		/*
-		 * For example, ufvalue = 2.99962, intpart = 2, and mask = 1000
-		 * (because precision = 3).  Now, myround(1000 * 0.99962) will
-		 * return 1000.  So, the integer part must be incremented by one
-		 * and the fractional part must be set to zero.
-		 */
-		intpart++;
-		fracpart = 0;
-		if (estyle && intpart == 10) {
-			/*
-			 * The value was rounded up to ten, but we only want one
-			 * integer digit if using "e-style".  So, the integer
-			 * part must be set to one and the exponent must be
-			 * incremented by one.
-			 */
-			intpart = 1;
-			exponent++;
-		}
-	}
-
-	/*
-	 * Now that we know the real exponent, we can check whether or not to
-	 * use "e-style" for "%g" (and "%G") conversions.  If we don't need
-	 * "e-style", the precision must be adjusted and the integer and
-	 * fractional parts must be recalculated from the original value.
-	 *
-	 * C99 says: "Let P equal the precision if nonzero, 6 if the precision
-	 * is omitted, or 1 if the precision is zero.  Then, if a conversion
-	 * with style `E' would have an exponent of X:
-	 *
-	 * - if P > X >= -4, the conversion is with style `f' (or `F') and
-	 *   precision P - (X + 1).
-	 *
-	 * - otherwise, the conversion is with style `e' (or `E') and precision
-	 *   P - 1." (7.19.6.1, 8)
-	 *
-	 * Note that we had decremented the precision by one.
-	 */
-	if (flags & PRINT_F_TYPE_G && estyle &&
-	    precision + 1 > exponent && exponent >= -4) {
-		precision -= exponent;
-		estyle = 0;
-		goto again;
-	}
-
-	if (estyle) {
-		if (exponent < 0) {
-			exponent = -exponent;
-			esign = '-';
-		} else
-			esign = '+';
-
-		/*
-		 * Convert the exponent.  The sizeof(econvert) is 4.  So, the
-		 * econvert buffer can hold e.g. "e+99" and "e-99".  We don't
-		 * support an exponent which contains more than two digits.
-		 * Therefore, the following stores are safe.
-		 */
-		epos = convert(exponent, econvert, 2, 10, 0);
-		/*
-		 * C99 says: "The exponent always contains at least two digits,
-		 * and only as many more digits as necessary to represent the
-		 * exponent." (7.19.6.1, 8)
-		 */
-		if (epos == 1)
-			econvert[epos++] = '0';
-		econvert[epos++] = esign;
-		econvert[epos++] = (flags & PRINT_F_UP) ? 'E' : 'e';
-	}
-
-	/* Convert the integer part and the fractional part. */
-	ipos = convert(intpart, iconvert, sizeof(iconvert), 10, 0);
-	if (fracpart != 0)	/* convert() would return 1 if fracpart == 0. */
-		fpos = convert(fracpart, fconvert, sizeof(fconvert), 10, 0);
-
-	leadfraczeros = precision - fpos;
-
-	if (omitzeros) {
-		if (fpos > 0)	/* Omit trailing fractional part zeros. */
-			while (omitcount < fpos && fconvert[omitcount] == '0')
-				omitcount++;
-		else {	/* The fractional part is zero, omit it completely. */
-			omitcount = precision;
-			leadfraczeros = 0;
-		}
-		precision -= omitcount;
-	}
-
-	/*
-	 * Print a decimal point if either the fractional part is non-zero
-	 * and/or the "#" flag was specified.
-	 */
-	if (precision > 0 || flags & PRINT_F_NUM)
-		emitpoint = 1;
-	if (separators)	/* Get the number of group separators we'll print. */
-		separators = getnumsep(ipos);
-
-	padlen = width                  /* Minimum field width. */
-	    - ipos                      /* Number of integer digits. */
-	    - epos                      /* Number of exponent characters. */
-	    - precision                 /* Number of fractional digits. */
-	    - separators                /* Number of group separators. */
-	    - (emitpoint ? 1 : 0)       /* Will we print a decimal point? */
-	    - ((sign != 0) ? 1 : 0);    /* Will we print a sign character? */
-
-	if (padlen < 0)
-		padlen = 0;
-
-	/*
-	 * C99 says: "If the `0' and `-' flags both appear, the `0' flag is
-	 * ignored." (7.19.6.1, 6)
-	 */
-	if (flags & PRINT_F_MINUS)	/* Left justifty. */
-		padlen = -padlen;
-	else if (flags & PRINT_F_ZERO && padlen > 0) {
-		if (sign != 0) {	/* Sign. */
-			OUTCHAR(str, *len, size, sign);
-			sign = 0;
-		}
-		while (padlen > 0) {	/* Leading zeros. */
-			OUTCHAR(str, *len, size, '0');
-			padlen--;
-		}
-	}
-	while (padlen > 0) {	/* Leading spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen--;
-	}
-	if (sign != 0)	/* Sign. */
-		OUTCHAR(str, *len, size, sign);
-	while (ipos > 0) {	/* Integer part. */
-		ipos--;
-		OUTCHAR(str, *len, size, iconvert[ipos]);
-		if (separators > 0 && ipos > 0 && ipos % 3 == 0)
-			printsep(str, len, size);
-	}
-	if (emitpoint) {	/* Decimal point. */
-#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT
-		if (lc->decimal_point != NULL && *lc->decimal_point != '\0')
-			OUTCHAR(str, *len, size, *lc->decimal_point);
-		else	/* We'll always print some decimal point character. */
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */
-			OUTCHAR(str, *len, size, '.');
-	}
-	while (leadfraczeros > 0) {	/* Leading fractional part zeros. */
-		OUTCHAR(str, *len, size, '0');
-		leadfraczeros--;
-	}
-	while (fpos > omitcount) {	/* The remaining fractional part. */
-		fpos--;
-		OUTCHAR(str, *len, size, fconvert[fpos]);
-	}
-	while (epos > 0) {	/* Exponent. */
-		epos--;
-		OUTCHAR(str, *len, size, econvert[epos]);
-	}
-	while (padlen < 0) {	/* Trailing spaces. */
-		OUTCHAR(str, *len, size, ' ');
-		padlen++;
-	}
-}
-
-static void
-printsep(char *str, size_t *len, size_t size)
-{
-#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP
-	struct lconv *lc = localeconv();
-	int i;
-
-	if (lc->thousands_sep != NULL)
-		for (i = 0; lc->thousands_sep[i] != '\0'; i++)
-			OUTCHAR(str, *len, size, lc->thousands_sep[i]);
-	else
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */
-		OUTCHAR(str, *len, size, ',');
-}
-
-static int
-getnumsep(int digits)
-{
-	int separators = (digits - ((digits % 3 == 0) ? 1 : 0)) / 3;
-#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP
-	int strln;
-	struct lconv *lc = localeconv();
-
-	/* We support an arbitrary separator length (including zero). */
-	if (lc->thousands_sep != NULL) {
-		for (strln = 0; lc->thousands_sep[strln] != '\0'; strln++)
-			continue;
-		separators *= strln;
-	}
-#endif	/* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */
-	return separators;
-}
-
-static int
-getexponent(LDOUBLE value)
-{
-	LDOUBLE tmp = (value >= 0.0) ? value : -value;
-	int exponent = 0;
-
-	/*
-	 * We check for 99 > exponent > -99 in order to work around possible
-	 * endless loops which could happen (at least) in the second loop (at
-	 * least) if we're called with an infinite value.  However, we checked
-	 * for infinity before calling this function using our ISINF() macro, so
-	 * this might be somewhat paranoid.
-	 */
-	while (tmp < 1.0 && tmp > 0.0 && --exponent > -99)
-		tmp *= 10;
-	while (tmp >= 10.0 && ++exponent < 99)
-		tmp /= 10;
-
-	return exponent;
-}
-
-static int
-convert(UINTMAX_T value, char *buf, size_t size, int base, int caps)
-{
-	const char *digits = caps ? "0123456789ABCDEF" : "0123456789abcdef";
-	size_t pos = 0;
-
-	/* We return an unterminated buffer with the digits in reverse order. */
-	do {
-		buf[pos++] = digits[value % base];
-		value /= base;
-	} while (value != 0 && pos < size);
-
-	return (int)pos;
-}
-
-static UINTMAX_T
-cast(LDOUBLE value)
-{
-	UINTMAX_T result;
-
-	/*
-	 * We check for ">=" and not for ">" because if UINTMAX_MAX cannot be
-	 * represented exactly as an LDOUBLE value (but is less than LDBL_MAX),
-	 * it may be increased to the nearest higher representable value for the
-	 * comparison (cf. C99: 6.3.1.4, 2).  It might then equal the LDOUBLE
-	 * value although converting the latter to UINTMAX_T would overflow.
-	 */
-	if (value >= UINTMAX_MAX)
-		return UINTMAX_MAX;
-
-	result = (UINTMAX_T)value;
-	/*
-	 * At least on NetBSD/sparc64 3.0.2 and 4.99.30, casting long double to
-	 * an integer type converts e.g. 1.9 to 2 instead of 1 (which violates
-	 * the standard).  Sigh.
-	 */
-	return (result <= value) ? result : result - 1;
-}
-
-static UINTMAX_T
-myround(LDOUBLE value)
-{
-	UINTMAX_T intpart = cast(value);
-
-	return ((value -= intpart) < 0.5) ? intpart : intpart + 1;
-}
-
-static LDOUBLE
-mypow10(int exponent)
-{
-	LDOUBLE result = 1;
-
-	while (exponent > 0) {
-		result *= 10;
-		exponent--;
-	}
-	while (exponent < 0) {
-		result /= 10;
-		exponent++;
-	}
-	return result;
-}
-#endif	/* !HAVE_VSNPRINTF */
-
-#if !HAVE_VASPRINTF
-#if NEED_MYMEMCPY
-void *
-mymemcpy(void *dst, void *src, size_t len)
-{
-	const char *from = src;
-	char *to = dst;
-
-	/* No need for optimization, we use this only to replace va_copy(3). */
-	while (len-- > 0)
-		*to++ = *from++;
-	return dst;
-}
-#endif	/* NEED_MYMEMCPY */
-
-int
-util_vasprintf(char **ret, const char *format, va_list ap)
-{
-	size_t size;
-	int len;
-	va_list aq;
-
-	VA_COPY(aq, ap);
-	len = vsnprintf(NULL, 0, format, aq);
-	VA_END_COPY(aq);
-	if (len < 0 || (*ret = malloc(size = len + 1)) == NULL)
-		return -1;
-	return vsnprintf(*ret, size, format, ap);
-}
-#endif	/* !HAVE_VASPRINTF */
-
-#if !HAVE_SNPRINTF
-#if HAVE_STDARG_H
-int
-util_snprintf(char *str, size_t size, const char *format, ...)
-#else
-int
-util_snprintf(va_alist) va_dcl
-#endif	/* HAVE_STDARG_H */
-{
-#if !HAVE_STDARG_H
-	char *str;
-	size_t size;
-	char *format;
-#endif	/* HAVE_STDARG_H */
-	va_list ap;
-	int len;
-
-	VA_START(ap, format);
-	VA_SHIFT(ap, str, char *);
-	VA_SHIFT(ap, size, size_t);
-	VA_SHIFT(ap, format, const char *);
-	len = vsnprintf(str, size, format, ap);
-	va_end(ap);
-	return len;
-}
-#endif	/* !HAVE_SNPRINTF */
-
-#if !HAVE_ASPRINTF
-#if HAVE_STDARG_H
-int
-util_asprintf(char **ret, const char *format, ...)
-#else
-int
-util_asprintf(va_alist) va_dcl
-#endif	/* HAVE_STDARG_H */
-{
-#if !HAVE_STDARG_H
-	char **ret;
-	char *format;
-#endif	/* HAVE_STDARG_H */
-	va_list ap;
-	int len;
-
-	VA_START(ap, format);
-	VA_SHIFT(ap, ret, char **);
-	VA_SHIFT(ap, format, const char *);
-	len = vasprintf(ret, format, ap);
-	va_end(ap);
-	return len;
-}
-#endif	/* !HAVE_ASPRINTF */
-#else	/* Dummy declaration to avoid empty translation unit warnings. */
-int main(void);
-#endif	/* !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || [...] */
-
-
-/* vim: set joinspaces textwidth=80: */
diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h
index 7f80fc12700..5afb7d9a920 100644
--- a/src/gallium/auxiliary/util/u_split_prim.h
+++ b/src/gallium/auxiliary/util/u_split_prim.h
@@ -23,7 +23,7 @@ struct util_split_prim {
    uint edgeflag_off:1;
 };
 
-static INLINE void
+static inline void
 util_split_prim_init(struct util_split_prim *s,
                   unsigned mode, unsigned start, unsigned count)
 {
@@ -41,7 +41,7 @@ util_split_prim_init(struct util_split_prim *s,
    s->repeat_first = 0;
 }
 
-static INLINE boolean
+static inline boolean
 util_split_prim_next(struct util_split_prim *s, unsigned max_verts)
 {
    int repeat = 0;
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index d4f51912a2d..7f8e5a1a3cf 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -51,7 +51,7 @@ union m128i {
    uint ui[4];
 };
 
-static INLINE void u_print_epi8(const char *name, __m128i r)
+static inline void u_print_epi8(const char *name, __m128i r)
 {
    union { __m128i m; ubyte ub[16]; } u;
    u.m = r;
@@ -80,7 +80,7 @@ static INLINE void u_print_epi8(const char *name, __m128i r)
                 u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
 }
 
-static INLINE void u_print_epi16(const char *name, __m128i r)
+static inline void u_print_epi16(const char *name, __m128i r)
 {
    union { __m128i m; ushort us[8]; } u;
    u.m = r;
@@ -99,7 +99,7 @@ static INLINE void u_print_epi16(const char *name, __m128i r)
                 u.us[4],  u.us[5],  u.us[6],  u.us[7]);
 }
 
-static INLINE void u_print_epi32(const char *name, __m128i r)
+static inline void u_print_epi32(const char *name, __m128i r)
 {
    union { __m128i m; uint ui[4]; } u;
    u.m = r;
@@ -113,7 +113,7 @@ static INLINE void u_print_epi32(const char *name, __m128i r)
                 u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
 }
 
-static INLINE void u_print_ps(const char *name, __m128 r)
+static inline void u_print_ps(const char *name, __m128 r)
 {
    union { __m128 m; float f[4]; } u;
    u.m = r;
@@ -179,7 +179,7 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
  * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
  * dependency at this point.
  */
-static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+static inline __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 {
    __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
    __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
@@ -204,7 +204,7 @@ static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 }
 
 
-static INLINE void
+static inline void
 transpose4_epi32(const __m128i * restrict a,
                  const __m128i * restrict b,
                  const __m128i * restrict c,
diff --git a/src/gallium/auxiliary/util/u_string.h b/src/gallium/auxiliary/util/u_string.h
index dc89c4400bc..f7ab09c8f1c 100644
--- a/src/gallium/auxiliary/util/u_string.h
+++ b/src/gallium/auxiliary/util/u_string.h
@@ -35,13 +35,14 @@
 #ifndef U_STRING_H_
 #define U_STRING_H_
 
-#if !defined(_MSC_VER) && !defined(XF86_LIBC_H)
+#if !defined(XF86_LIBC_H)
 #include <stdio.h>
 #endif
 #include <stddef.h>
 #include <stdarg.h>
 
 #include "pipe/p_compiler.h"
+#include "util/macros.h" // PRINTFLIKE
 
 
 #ifdef __cplusplus
@@ -54,7 +55,7 @@ extern "C" {
 
 #else
 
-static INLINE char *
+static inline char *
 util_strchrnul(const char *s, char c)
 {
    for (; *s && *s != c; ++s);
@@ -64,18 +65,56 @@ util_strchrnul(const char *s, char c)
 
 #endif
 
-#ifdef _MSC_VER
+#ifdef _WIN32
 
-int util_vsnprintf(char *, size_t, const char *, va_list);
-int util_snprintf(char *str, size_t size, const char *format, ...);
+/**
+ * vsnprintf() replacement used on _WIN32 builds.
+ *
+ * Returns the result of _vsnprintf(); if that is negative, returns the result
+ * of _vscprintf() for the same format and arguments instead.
+ */
+static inline int
+util_vsnprintf(char *str, size_t size, const char *format, va_list ap)
+{
+   /* _vsnprintf returns a negative value if the number of characters to write
+    * is greater than size, so fall back to _vscprintf to calculate the length.
+    */
+   va_list ap_copy;
+   int ret;
+   va_copy(ap_copy, ap);
+   ret = _vsnprintf(str, size, format, ap);
+   if (ret < 0) {
+      ret = _vscprintf(format, ap_copy);
+   }
+   /* Each va_copy() must be matched by a va_end() on the copy. */
+   va_end(ap_copy);
+   return ret;
+}
 
-static INLINE void
+/**
+ * Variadic wrapper that forwards its arguments to util_vsnprintf() and returns
+ * its result.
+ */
+static inline int
+   PRINTFLIKE(3, 4)
+util_snprintf(char *str, size_t size, const char *format, ...)
+{
+   va_list ap;
+   int ret;
+   va_start(ap, format);
+   ret = util_vsnprintf(str, size, format, ap);
+   va_end(ap);
+   return ret;
+}
+
+static inline void
 util_vsprintf(char *str, const char *format, va_list ap)
 {
    util_vsnprintf(str, (size_t)-1, format, ap);
 }
 
-static INLINE void
+static inline void
+   PRINTFLIKE(2, 3)
 util_sprintf(char *str, const char *format, ...)
 {
    va_list ap;
@@ -84,7 +111,7 @@ util_sprintf(char *str, const char *format, ...)
    va_end(ap);
 }
 
-static INLINE char *
+static inline char *
 util_strchr(const char *s, char c)
 {
    char *p = util_strchrnul(s, c);
@@ -92,7 +119,7 @@ util_strchr(const char *s, char c)
    return *p ? p : NULL;
 }
 
-static INLINE char*
+static inline char*
 util_strncat(char *dst, const char *src, size_t n)
 {
    char *p = dst + strlen(dst);
@@ -106,7 +133,7 @@ util_strncat(char *dst, const char *src, size_t n)
    return dst;
 }
 
-static INLINE int
+static inline int
 util_strcmp(const char *s1, const char *s2)
 {
    unsigned char u1, u2;
@@ -122,7 +149,7 @@ util_strcmp(const char *s1, const char *s2)
    return 0;
 }
 
-static INLINE int
+static inline int
 util_strncmp(const char *s1, const char *s2, size_t n)
 {
    unsigned char u1, u2;
@@ -138,7 +165,7 @@ util_strncmp(const char *s1, const char *s2, size_t n)
    return 0;
 }
 
-static INLINE char *
+static inline char *
 util_strstr(const char *haystack, const char *needle)
 {
    const char *p = haystack;
@@ -152,7 +179,7 @@ util_strstr(const char *haystack, const char *needle)
    return NULL;
 }
 
-static INLINE void *
+static inline void *
 util_memmove(void *dest, const void *src, size_t n)
 {
    char *p = (char *)dest;
@@ -199,7 +226,7 @@ struct util_strbuf
 };
 
 
-static INLINE void
+static inline void
 util_strbuf_init(struct util_strbuf *sbuf, char *str, size_t size) 
 {
    sbuf->str = str;
@@ -209,7 +236,7 @@ util_strbuf_init(struct util_strbuf *sbuf, char *str, size_t size)
 }
 
 
-static INLINE void
+static inline void
 util_strbuf_printf(struct util_strbuf *sbuf, const char *format, ...)
 {
    if(sbuf->left > 1) {
diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h
index 1605215cb88..b84694c540b 100644
--- a/src/gallium/auxiliary/util/u_surfaces.h
+++ b/src/gallium/auxiliary/util/u_surfaces.h
@@ -50,7 +50,7 @@ util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size,
                      struct pipe_surface **res);
 
 /* fast inline path for the very common case */
-static INLINE boolean
+static inline boolean
 util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size,
                   struct pipe_context *ctx, struct pipe_resource *pt,
                   unsigned level, unsigned layer,
@@ -70,7 +70,7 @@ util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size,
    return util_surfaces_do_get(us, surface_struct_size, ctx, pt, level, layer, res);
 }
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned level, unsigned layer)
 {
    if(!us->u.pv)
@@ -84,7 +84,7 @@ util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned
 
 void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps);
 
-static INLINE void
+static inline void
 util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps)
 {
    if(likely(ps->texture->target == PIPE_TEXTURE_2D || ps->texture->target == PIPE_TEXTURE_RECT))
diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h
index a33d7f7722b..dc1f568a8e5 100644
--- a/src/gallium/auxiliary/util/u_tile.h
+++ b/src/gallium/auxiliary/util/u_tile.h
@@ -42,7 +42,7 @@ struct pipe_transfer;
  *
  * \return TRUE if tile is totally clipped, FALSE otherwise
  */
-static INLINE boolean
+static inline boolean
 u_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_box *box)
 {
    if ((int) x >= box->width)
diff --git a/src/gallium/auxiliary/util/u_time.h b/src/gallium/auxiliary/util/u_time.h
index 2bee1e00014..a5017d6bce2 100644
--- a/src/gallium/auxiliary/util/u_time.h
+++ b/src/gallium/auxiliary/util/u_time.h
@@ -60,7 +60,7 @@ struct util_time
    
 
 PIPE_DEPRECATED
-static INLINE void
+static inline void
 util_time_get(struct util_time *t)
 {
    t->counter = os_time_get();
@@ -71,7 +71,7 @@ util_time_get(struct util_time *t)
  * Return t2 = t1 + usecs
  */
 PIPE_DEPRECATED
-static INLINE void
+static inline void
 util_time_add(const struct util_time *t1,
               int64_t usecs,
               struct util_time *t2)
@@ -84,7 +84,7 @@ util_time_add(const struct util_time *t1,
  * Return difference between times, in microseconds
  */
 PIPE_DEPRECATED
-static INLINE int64_t
+static inline int64_t
 util_time_diff(const struct util_time *t1, 
                const struct util_time *t2)
 {
@@ -98,7 +98,7 @@ util_time_diff(const struct util_time *t1,
  * Not publicly available because it does not take in account wrap-arounds.
  * Use util_time_timeout instead.
  */
-static INLINE int
+static inline int
 _util_time_compare(const struct util_time *t1,
                    const struct util_time *t2)
 {
@@ -115,7 +115,7 @@ _util_time_compare(const struct util_time *t1,
  * Returns non-zero when the timeout expires.
  */
 PIPE_DEPRECATED
-static INLINE boolean
+static inline boolean
 util_time_timeout(const struct util_time *start, 
                   const struct util_time *end,
                   const struct util_time *curr)
@@ -128,7 +128,7 @@ util_time_timeout(const struct util_time *start,
  * Return current time in microseconds
  */
 PIPE_DEPRECATED
-static INLINE int64_t
+static inline int64_t
 util_time_micros(void)
 {
    return os_time_get();
@@ -136,7 +136,7 @@ util_time_micros(void)
 
 
 PIPE_DEPRECATED
-static INLINE void
+static inline void
 util_time_sleep(int64_t usecs)
 {
    os_time_sleep(usecs);
diff --git a/src/gallium/auxiliary/util/u_transfer.c b/src/gallium/auxiliary/util/u_transfer.c
index 71da35d6d39..4cb524d5cb1 100644
--- a/src/gallium/auxiliary/util/u_transfer.c
+++ b/src/gallium/auxiliary/util/u_transfer.c
@@ -90,7 +90,7 @@ void u_default_transfer_unmap( struct pipe_context *pipe,
 }
 
 
-static INLINE struct u_resource *
+static inline struct u_resource *
 u_resource( struct pipe_resource *res )
 {
    return (struct u_resource *)res;
diff --git a/src/gallium/auxiliary/util/u_video.h b/src/gallium/auxiliary/util/u_video.h
index b4743d13fbf..ddc00216105 100644
--- a/src/gallium/auxiliary/util/u_video.h
+++ b/src/gallium/auxiliary/util/u_video.h
@@ -40,7 +40,7 @@ extern "C" {
 #include "util/u_debug.h"
 #include "util/u_math.h"
 
-static INLINE enum pipe_video_format
+static inline enum pipe_video_format
 u_reduce_video_profile(enum pipe_video_profile profile)
 {
    switch (profile)
@@ -68,12 +68,19 @@ u_reduce_video_profile(enum pipe_video_profile profile)
       case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444:
          return PIPE_VIDEO_FORMAT_MPEG4_AVC;
 
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_12:
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_444:
+         return PIPE_VIDEO_FORMAT_HEVC;
+
       default:
          return PIPE_VIDEO_FORMAT_UNKNOWN;
    }
 }
 
-static INLINE void
+static inline void
 u_copy_nv12_to_yv12(void *const *destination_data,
                     uint32_t const *destination_pitches,
                     int src_plane, int src_field,
@@ -99,7 +106,7 @@ u_copy_nv12_to_yv12(void *const *destination_data,
    }
 }
 
-static INLINE void
+static inline void
 u_copy_yv12_to_nv12(void *const *destination_data,
                     uint32_t const *destination_pitches,
                     int src_plane, int src_field,
@@ -122,7 +129,7 @@ u_copy_yv12_to_nv12(void *const *destination_data,
    }
 }
 
-static INLINE void
+static inline void
 u_copy_swap422_packed(void *const *destination_data,
                        uint32_t const *destination_pitches,
                        int src_plane, int src_field,
@@ -147,7 +154,7 @@ u_copy_swap422_packed(void *const *destination_data,
    }
 }
 
-static INLINE uint32_t
+static inline uint32_t
 u_get_h264_level(uint32_t width, uint32_t height, uint32_t *max_reference)
 {
    uint32_t max_dpb_mbs;
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index 69839e61386..afe53063b48 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -538,7 +538,7 @@ cleanup_buffers(struct vl_compositor *c)
    pipe_resource_reference(&c->vertex_buf.buffer, NULL);
 }
 
-static INLINE struct u_rect
+static inline struct u_rect
 default_rect(struct vl_compositor_layer *layer)
 {
    struct pipe_resource *res = layer->sampler_views[0]->texture;
@@ -546,21 +546,21 @@ default_rect(struct vl_compositor_layer *layer)
    return rect;
 }
 
-static INLINE struct vertex2f
+static inline struct vertex2f
 calc_topleft(struct vertex2f size, struct u_rect rect)
 {
    struct vertex2f res = { rect.x0 / size.x, rect.y0 / size.y };
    return res;
 }
 
-static INLINE struct vertex2f
+static inline struct vertex2f
 calc_bottomright(struct vertex2f size, struct u_rect rect)
 {
    struct vertex2f res = { rect.x1 / size.x, rect.y1 / size.y };
    return res;
 }
 
-static INLINE void
+static inline void
 calc_src_and_dst(struct vl_compositor_layer *layer, unsigned width, unsigned height,
                  struct u_rect src, struct u_rect dst)
 {
@@ -658,7 +658,7 @@ gen_rect_verts(struct vertex2f *vb, struct vl_compositor_layer *layer)
    vb[19].y = layer->colors[3].w;
 }
 
-static INLINE struct u_rect
+static inline struct u_rect
 calc_drawn_area(struct vl_compositor_state *s, struct vl_compositor_layer *layer)
 {
    struct vertex2f tl, br;
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
index abb3780f61e..52ce6c416aa 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
@@ -533,7 +533,7 @@ static struct dct_coeff tbl_B14_DC[1 << 17];
 static struct dct_coeff tbl_B14_AC[1 << 17];
 static struct dct_coeff tbl_B15[1 << 17];
 
-static INLINE void
+static inline void
 init_dct_coeff_table(struct dct_coeff *dst, const struct dct_coeff_compressed *src,
                      unsigned size, bool is_DC)
 {
@@ -594,7 +594,7 @@ init_dct_coeff_table(struct dct_coeff *dst, const struct dct_coeff_compressed *s
    }
 }
 
-static INLINE void
+static inline void
 init_tables()
 {
    vl_vlc_init_table(tbl_B1, Elements(tbl_B1), macroblock_address_increment, Elements(macroblock_address_increment));
@@ -611,19 +611,19 @@ init_tables()
    init_dct_coeff_table(tbl_B15, dct_coeff_tbl_one, Elements(dct_coeff_tbl_one), false);
 }
 
-static INLINE int
+static inline int
 DIV2DOWN(int todiv)
 {
    return (todiv&~1)/2;
 }
 
-static INLINE int
+static inline int
 DIV2UP(int todiv)
 {
    return (todiv+1)/2;
 }
 
-static INLINE void
+static inline void
 motion_vector(struct vl_mpg12_bs *bs, int r, int s, int dmv, short delta[2], short dmvector[2])
 {
    int t;
@@ -647,7 +647,7 @@ motion_vector(struct vl_mpg12_bs *bs, int r, int s, int dmv, short delta[2], sho
    }
 }
 
-static INLINE int
+static inline int
 wrap(short f, int shift)
 {
    if (f < (-16 << shift))
@@ -658,7 +658,7 @@ wrap(short f, int shift)
       return f;
 }
 
-static INLINE void
+static inline void
 motion_vector_frame(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock *mb)
 {
    int dmv = mb->macroblock_modes.bits.frame_motion_type == PIPE_MPEG12_MO_TYPE_DUAL_PRIME;
@@ -682,7 +682,7 @@ motion_vector_frame(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock
    }
 }
 
-static INLINE void
+static inline void
 motion_vector_field(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock *mb)
 {
    int dmv = mb->macroblock_modes.bits.field_motion_type == PIPE_MPEG12_MO_TYPE_DUAL_PRIME;
@@ -701,12 +701,12 @@ motion_vector_field(struct vl_mpg12_bs *bs, int s, struct pipe_mpeg12_macroblock
    }
 }
 
-static INLINE void
+static inline void
 reset_predictor(struct vl_mpg12_bs *bs) {
    bs->pred_dc[0] = bs->pred_dc[1] = bs->pred_dc[2] = 0;
 }
 
-static INLINE void
+static inline void
 decode_dct(struct vl_mpg12_bs *bs, struct pipe_mpeg12_macroblock *mb, int scale)
 {
    static const unsigned blk2cc[] = { 0, 0, 0, 0, 1, 2 };
@@ -805,7 +805,7 @@ entry:
       vl_vlc_eatbits(&bs->vlc, 1);
 }
 
-static INLINE void
+static inline void
 decode_slice(struct vl_mpg12_bs *bs, struct pipe_video_buffer *target)
 {
    struct pipe_mpeg12_macroblock mb;
@@ -929,6 +929,7 @@ decode_slice(struct vl_mpg12_bs *bs, struct pipe_video_buffer *target)
          mb.PMV[1][0][0] = mb.PMV[0][0][0];
          mb.PMV[1][0][1] = mb.PMV[0][0][1];
          assert(extra);
+         (void) extra;
       } else if (mb.macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA ||
                 !(mb.macroblock_type & (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD |
                                         PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD))) {
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index 8579460e070..b7009837293 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -84,6 +84,9 @@ static const unsigned const_empty_block_mask_420[3][2][2] = {
 
 struct video_buffer_private
 {
+   struct list_head list;
+   struct pipe_video_buffer *video_buffer;
+
    struct pipe_sampler_view *sampler_view_planes[VL_NUM_COMPONENTS];
    struct pipe_surface      *surfaces[VL_MAX_SURFACES];
 
@@ -99,6 +102,8 @@ destroy_video_buffer_private(void *private)
    struct video_buffer_private *priv = private;
    unsigned i;
 
+   list_del(&priv->list);
+
    for (i = 0; i < VL_NUM_COMPONENTS; ++i)
       pipe_sampler_view_reference(&priv->sampler_view_planes[i], NULL);
 
@@ -126,6 +131,9 @@ get_video_buffer_private(struct vl_mpeg12_decoder *dec, struct pipe_video_buffer
 
    priv = CALLOC_STRUCT(video_buffer_private);
 
+   list_add(&priv->list, &dec->buffer_privates);
+   priv->video_buffer = buf;
+
    sv = buf->get_sampler_view_planes(buf);
    for (i = 0; i < VL_NUM_COMPONENTS; ++i)
       if (sv[i])
@@ -141,6 +149,18 @@ get_video_buffer_private(struct vl_mpeg12_decoder *dec, struct pipe_video_buffer
    return priv;
 }
 
+static void
+free_video_buffer_privates(struct vl_mpeg12_decoder *dec)
+{
+   struct video_buffer_private *priv, *next;
+   /* For each entry on dec->buffer_privates, reset its buffer's data for this decoder (last two args NULL). */
+   LIST_FOR_EACH_ENTRY_SAFE(priv, next, &dec->buffer_privates, list) {
+      struct pipe_video_buffer *buf = priv->video_buffer;
+      /* NOTE(review): presumably this ends up in destroy_video_buffer_private(), which list_del()s priv -- hence _SAFE; confirm. */
+      vl_video_buffer_set_associated_data(buf, &dec->base, NULL, NULL);
+   }
+}
+
 static bool
 init_zscan_buffer(struct vl_mpeg12_decoder *dec, struct vl_mpeg12_buffer *buffer)
 {
@@ -297,7 +317,7 @@ cleanup_mc_buffer(struct vl_mpeg12_buffer *buf)
       vl_mc_cleanup_buffer(&buf->mc[i]);
 }
 
-static INLINE void
+static inline void
 MacroBlockTypeToPipeWeights(const struct pipe_mpeg12_macroblock *mb, unsigned weights[2])
 {
    assert(mb);
@@ -332,7 +352,7 @@ MacroBlockTypeToPipeWeights(const struct pipe_mpeg12_macroblock *mb, unsigned we
    }
 }
 
-static INLINE struct vl_motionvector
+static inline struct vl_motionvector
 MotionVectorToPipe(const struct pipe_mpeg12_macroblock *mb, unsigned vector,
                    unsigned field_select_mask, unsigned weight)
 {
@@ -383,7 +403,7 @@ MotionVectorToPipe(const struct pipe_mpeg12_macroblock *mb, unsigned vector,
    return mv;
 }
 
-static INLINE void
+static inline void
 UploadYcbcrBlocks(struct vl_mpeg12_decoder *dec,
                   struct vl_mpeg12_buffer *buf,
                   const struct pipe_mpeg12_macroblock *mb)
@@ -464,6 +484,8 @@ vl_mpeg12_destroy(struct pipe_video_codec *decoder)
 
    assert(decoder);
 
+   free_video_buffer_privates(dec);
+
    /* Asserted in softpipe_delete_fs_state() for some reason */
    dec->context->bind_vs_state(dec->context, NULL);
    dec->context->bind_fs_state(dec->context, NULL);
@@ -1187,6 +1209,8 @@ vl_create_mpeg12_decoder(struct pipe_context *context,
    if (!init_pipe_state(dec))
       goto error_pipe_state;
 
+   list_inithead(&dec->buffer_privates);
+
    return &dec->base;
 
 error_pipe_state:
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h
index 2a604054387..505dd675f66 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.h
@@ -30,6 +30,8 @@
 
 #include "pipe/p_video_codec.h"
 
+#include "util/list.h"
+
 #include "vl_mpeg12_bitstream.h"
 #include "vl_zscan.h"
 #include "vl_idct.h"
@@ -77,6 +79,8 @@ struct vl_mpeg12_decoder
 
    unsigned current_buffer;
    struct vl_mpeg12_buffer *dec_buffers[4];
+
+   struct list_head buffer_privates;
 };
 
 struct vl_mpeg12_buffer
diff --git a/src/gallium/auxiliary/vl/vl_rbsp.h b/src/gallium/auxiliary/vl/vl_rbsp.h
index 2e3da8e1d28..7867238c49e 100644
--- a/src/gallium/auxiliary/vl/vl_rbsp.h
+++ b/src/gallium/auxiliary/vl/vl_rbsp.h
@@ -48,7 +48,7 @@ struct vl_rbsp {
 /**
  * Initialize the RBSP object
  */
-static INLINE void vl_rbsp_init(struct vl_rbsp *rbsp, struct vl_vlc *nal, unsigned num_bits)
+static inline void vl_rbsp_init(struct vl_rbsp *rbsp, struct vl_vlc *nal, unsigned num_bits)
 {
    unsigned bits_left = vl_vlc_bits_left(nal);
 
@@ -71,7 +71,7 @@ static INLINE void vl_rbsp_init(struct vl_rbsp *rbsp, struct vl_vlc *nal, unsign
 /**
  * Make at least 16 more bits available
  */
-static INLINE void vl_rbsp_fillbits(struct vl_rbsp *rbsp)
+static inline void vl_rbsp_fillbits(struct vl_rbsp *rbsp)
 {
    unsigned valid = vl_vlc_valid_bits(&rbsp->nal);
    unsigned i, bits;
@@ -108,7 +108,7 @@ static INLINE void vl_rbsp_fillbits(struct vl_rbsp *rbsp)
 /**
  * Return an unsigned integer from the first n bits
  */
-static INLINE unsigned vl_rbsp_u(struct vl_rbsp *rbsp, unsigned n)
+static inline unsigned vl_rbsp_u(struct vl_rbsp *rbsp, unsigned n)
 {
    if (n == 0)
       return 0;
@@ -120,7 +120,7 @@ static INLINE unsigned vl_rbsp_u(struct vl_rbsp *rbsp, unsigned n)
 /**
  * Return an unsigned exponential Golomb encoded integer
  */
-static INLINE unsigned vl_rbsp_ue(struct vl_rbsp *rbsp)
+static inline unsigned vl_rbsp_ue(struct vl_rbsp *rbsp)
 {
    unsigned bits = 0;
 
@@ -134,7 +134,7 @@ static INLINE unsigned vl_rbsp_ue(struct vl_rbsp *rbsp)
 /**
  * Return an signed exponential Golomb encoded integer
  */
-static INLINE signed vl_rbsp_se(struct vl_rbsp *rbsp)
+static inline signed vl_rbsp_se(struct vl_rbsp *rbsp)
 {
    signed codeNum = vl_rbsp_ue(rbsp);
    if (codeNum & 1)
@@ -146,7 +146,7 @@ static INLINE signed vl_rbsp_se(struct vl_rbsp *rbsp)
 /**
  * Are more data available in the RBSP ?
  */
-static INLINE bool vl_rbsp_more_data(struct vl_rbsp *rbsp)
+static inline bool vl_rbsp_more_data(struct vl_rbsp *rbsp)
 {
    unsigned bits, value;
 
diff --git a/src/gallium/auxiliary/vl/vl_vlc.h b/src/gallium/auxiliary/vl/vl_vlc.h
index 2f905956dbf..7821b8be0a1 100644
--- a/src/gallium/auxiliary/vl/vl_vlc.h
+++ b/src/gallium/auxiliary/vl/vl_vlc.h
@@ -65,7 +65,7 @@ struct vl_vlc_compressed
 /**
  * initalize and decompress a lookup table
  */
-static INLINE void
+static inline void
 vl_vlc_init_table(struct vl_vlc_entry *dst, unsigned dst_size, const struct vl_vlc_compressed *src, unsigned src_size)
 {
    unsigned i, bits = util_logbase2(dst_size);
@@ -87,7 +87,7 @@ vl_vlc_init_table(struct vl_vlc_entry *dst, unsigned dst_size, const struct vl_v
 /**
  * switch over to next input buffer
  */
-static INLINE void
+static inline void
 vl_vlc_next_input(struct vl_vlc *vlc)
 {
    unsigned len = vlc->sizes[0];
@@ -112,7 +112,7 @@ vl_vlc_next_input(struct vl_vlc *vlc)
 /**
  * align the data pointer to the next dword
  */
-static INLINE void
+static inline void
 vl_vlc_align_data_ptr(struct vl_vlc *vlc)
 {
    /* align the data pointer */
@@ -126,7 +126,7 @@ vl_vlc_align_data_ptr(struct vl_vlc *vlc)
 /**
  * fill the bit buffer, so that at least 32 bits are valid
  */
-static INLINE void
+static inline void
 vl_vlc_fillbits(struct vl_vlc *vlc)
 {
    assert(vlc);
@@ -175,7 +175,7 @@ vl_vlc_fillbits(struct vl_vlc *vlc)
 /**
  * initialize vlc structure and start reading from first input buffer
  */
-static INLINE void
+static inline void
 vl_vlc_init(struct vl_vlc *vlc, unsigned num_inputs,
             const void *const *inputs, const unsigned *sizes)
 {
@@ -203,7 +203,7 @@ vl_vlc_init(struct vl_vlc *vlc, unsigned num_inputs,
 /**
  * number of bits still valid in bit buffer
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_valid_bits(struct vl_vlc *vlc)
 {
    return 32 - vlc->invalid_bits;
@@ -212,7 +212,7 @@ vl_vlc_valid_bits(struct vl_vlc *vlc)
 /**
  * number of bits left over all inbut buffers
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_bits_left(struct vl_vlc *vlc)
 {
    signed bytes_left = vlc->end - vlc->data;
@@ -223,7 +223,7 @@ vl_vlc_bits_left(struct vl_vlc *vlc)
 /**
  * get num_bits from bit buffer without removing them
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_peekbits(struct vl_vlc *vlc, unsigned num_bits)
 {
    assert(vl_vlc_valid_bits(vlc) >= num_bits || vlc->data >= vlc->end);
@@ -233,7 +233,7 @@ vl_vlc_peekbits(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * remove num_bits from bit buffer
  */
-static INLINE void
+static inline void
 vl_vlc_eatbits(struct vl_vlc *vlc, unsigned num_bits)
 {
    assert(vl_vlc_valid_bits(vlc) >= num_bits);
@@ -245,7 +245,7 @@ vl_vlc_eatbits(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * get num_bits from bit buffer with removing them
  */
-static INLINE unsigned
+static inline unsigned
 vl_vlc_get_uimsbf(struct vl_vlc *vlc, unsigned num_bits)
 {
    unsigned value;
@@ -261,7 +261,7 @@ vl_vlc_get_uimsbf(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * treat num_bits as signed value and remove them from bit buffer
  */
-static INLINE signed
+static inline signed
 vl_vlc_get_simsbf(struct vl_vlc *vlc, unsigned num_bits)
 {
    signed value;
@@ -277,7 +277,7 @@ vl_vlc_get_simsbf(struct vl_vlc *vlc, unsigned num_bits)
 /**
  * lookup a value and length in a decompressed table
  */
-static INLINE int8_t
+static inline int8_t
 vl_vlc_get_vlclbf(struct vl_vlc *vlc, const struct vl_vlc_entry *tbl, unsigned num_bits)
 {
    tbl += vl_vlc_peekbits(vlc, num_bits);
@@ -288,7 +288,7 @@ vl_vlc_get_vlclbf(struct vl_vlc *vlc, const struct vl_vlc_entry *tbl, unsigned n
 /**
  * fast forward search for a specific byte value
  */
-static INLINE boolean
+static inline boolean
 vl_vlc_search_byte(struct vl_vlc *vlc, unsigned num_bits, uint8_t value)
 {
    /* make sure we are on a byte boundary */
@@ -345,7 +345,7 @@ vl_vlc_search_byte(struct vl_vlc *vlc, unsigned num_bits, uint8_t value)
 /**
  * remove num_bits bits starting at pos from the bitbuffer
  */
-static INLINE void
+static inline void
 vl_vlc_removebits(struct vl_vlc *vlc, unsigned pos, unsigned num_bits)
 {
    uint64_t lo = (vlc->buffer & (~0UL >> (pos + num_bits))) << num_bits;
@@ -357,7 +357,7 @@ vl_vlc_removebits(struct vl_vlc *vlc, unsigned pos, unsigned num_bits)
 /**
  * limit the number of bits left for fetching
  */
-static INLINE void
+static inline void
 vl_vlc_limit(struct vl_vlc *vlc, unsigned bits_left)
 {
    assert(bits_left <= vl_vlc_bits_left(vlc));
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c
index 7e61b88e6b5..3b1b87f9523 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -37,6 +37,8 @@
 #include <xf86drm.h>
 #include <errno.h>
 
+#include "loader.h"
+
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
@@ -293,6 +295,16 @@ vl_screen_get_private(struct vl_screen *vscreen)
    return vscreen;
 }
 
+static xcb_screen_t *
+get_xcb_screen(xcb_screen_iterator_t iter, int screen)
+{   /* iter is passed by value, so the caller's iterator is left untouched. */
+    for (; iter.rem; --screen, xcb_screen_next(&iter))
+        if (screen == 0)
+            return iter.data; /* screen-th entry, counted from iter's current position */
+    /* Ran out of entries before the countdown reached zero (also the outcome for a negative index). */
+    return NULL; /* NOTE(review): the call sites in vl_screen_create() use ->root on the result unchecked */
+}
+
 struct vl_screen*
 vl_screen_create(Display *display, int screen)
 {
@@ -334,8 +346,7 @@ vl_screen_create(Display *display, int screen)
       goto free_query;
 
    s = xcb_setup_roots_iterator(xcb_get_setup(scrn->conn));
-   while (screen--)
-	xcb_screen_next(&s);
+
    driverType = XCB_DRI2_DRIVER_TYPE_DRI;
 #ifdef DRI2DriverPrimeShift
    {
@@ -351,7 +362,7 @@ vl_screen_create(Display *display, int screen)
    }
 #endif
 
-   connect_cookie = xcb_dri2_connect_unchecked(scrn->conn, s.data->root, driverType);
+   connect_cookie = xcb_dri2_connect_unchecked(scrn->conn, get_xcb_screen(s, screen)->root, driverType);
    connect = xcb_dri2_connect_reply(scrn->conn, connect_cookie, NULL);
    if (connect == NULL || connect->driver_name_length + connect->device_name_length == 0)
       goto free_connect;
@@ -361,7 +372,7 @@ vl_screen_create(Display *display, int screen)
    if (!device_name)
       goto free_connect;
    memcpy(device_name, xcb_dri2_connect_device_name(connect), device_name_length);
-   fd = open(device_name, O_RDWR);
+   fd = loader_open_device(device_name);
    free(device_name);
 
    if (fd < 0)
@@ -370,7 +381,7 @@ vl_screen_create(Display *display, int screen)
    if (drmGetMagic(fd, &magic))
       goto free_connect;
 
-   authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn, s.data->root, magic);
+   authenticate_cookie = xcb_dri2_authenticate_unchecked(scrn->conn, get_xcb_screen(s, screen)->root, magic);
    authenticate = xcb_dri2_authenticate_reply(scrn->conn, authenticate_cookie, NULL);
 
    if (authenticate == NULL || !authenticate->authenticated)
@@ -379,7 +390,7 @@ vl_screen_create(Display *display, int screen)
 #if GALLIUM_STATIC_TARGETS
    scrn->base.pscreen = dd_create_screen(fd);
 #else
-   if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd, false))
+   if (pipe_loader_drm_probe_fd(&scrn->base.dev, fd))
       scrn->base.pscreen = pipe_loader_create_screen(scrn->base.dev, PIPE_SEARCH_DIR);
 #endif // GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/docs/d3d11ddi.txt b/src/gallium/docs/d3d11ddi.txt
deleted file mode 100644
index a7036481411..00000000000
--- a/src/gallium/docs/d3d11ddi.txt
+++ /dev/null
@@ -1,462 +0,0 @@
-This document compares the D3D10/D3D11 device driver interface with Gallium.
-It is written from the perspective of a developer implementing a D3D10/D3D11 driver as a Gallium state tracker.
-
-Note that naming and other cosmetic differences are not noted, since they don't really matter and would severely clutter the document.
-Gallium/OpenGL terminology is used in preference to D3D terminology.
-
-NOTE: this document tries to be complete but most likely isn't fully complete and also not fully correct: please submit patches if you spot anything incorrect
-
-Also note that this is specifically for the DirectX 10/11 Windows Vista/7 DDI interfaces.
-DirectX 9 has both user-mode (for Vista) and kernel mode (pre-Vista) interfaces, but they are significantly different from Gallium due to the presence of a lot of fixed function functionality.
-
-The user-visible DirectX 10/11 interfaces are distinct from the kernel DDI, but they match very closely.
-
-* Accessing Microsoft documentation
-
-See http://msdn.microsoft.com/en-us/library/dd445501.aspx ("D3D11DDI_DEVICEFUNCS") for D3D documentation.
-
-Also see http://download.microsoft.com/download/f/2/d/f2d5ee2c-b7ba-4cd0-9686-b6508b5479a1/direct3d10_web.pdf ("The Direct3D 10 System" by David Blythe) for an introduction to Direct3D 10 and the rationale for its design.
-
-The Windows Driver Kit contains the actual headers, as well as shader bytecode documentation.
-
-To get the headers from Linux, run the following, in a dedicated directory:
-wget http://download.microsoft.com/download/4/A/2/4A25C7D5-EFBE-4182-B6A9-AE6850409A78/GRMWDK_EN_7600_1.ISO
-sudo mount -o loop GRMWDK_EN_7600_1.ISO /mnt/tmp
-cabextract -x /mnt/tmp/wdk/headers_cab001.cab
-rename 's/^_(.*)_[0-9]*$/$1/' *
-sudo umount /mnt/tmp
-
-d3d10umddi.h contains the DDI interface analyzed in this document: note that it is much easier to read this online on MSDN.
-d3d{10,11}TokenizedProgramFormat.hpp contains the shader bytecode definitions: this is not available on MSDN.
-d3d9types.h contains DX9 shader bytecode, and DX9 types
-d3dumddi.h contains the DirectX 9 DDI interface
-
-* Glossary
-
-BC1: DXT1
-BC2: DXT3
-BC3: DXT5
-BC5: RGTC1
-BC6H: BPTC float
-BC7: BPTC
-CS = compute shader: OpenCL-like shader
-DS = domain shader: tessellation evaluation shader
-HS = hull shader: tessellation control shader
-IA = input assembler: primitive assembly
-Input layout: vertex elements
-OM = output merger: blender
-PS = pixel shader: fragment shader
-Primitive topology: primitive type
-Resource: buffer or texture
-Shader resource (view): sampler view
-SO = stream out: transform feedback
-Unordered access view: view supporting random read/write access (usually from compute shaders)
-
-* Legend
-
--: features D3D11 has and Gallium lacks
-+: features Gallium has and D3D11 lacks
-!: differences between D3D11 and Gallium
-*: possible improvements to Gallium
->: references to comparisons of special enumerations
-#: comment
-
-* Gallium functions with no direct D3D10/D3D11 equivalent
-
-clear
-	+ Gallium supports clearing both render targets and depth/stencil with a single call
-
-fence_signalled
-fence_finish
-	+ D3D10/D3D11 don't appear to support explicit fencing; queries can often substitute though, and flushing is supported
-
-set_clip_state
-	+ Gallium supports fixed function user clip planes, D3D10/D3D11 only support using the vertex shader for them
-
-set_polygon_stipple
-	+ Gallium supports polygon stipple
-
-clearRT/clearDS
-	+ Gallium supports subrectangle fills of surfaces, D3D10 only supports full clears of views
-
-* DirectX 10/11 DDI functions and Gallium equivalents
-
-AbandonCommandList (D3D11 only)
-	- Gallium does not support deferred contexts
-
-CalcPrivateBlendStateSize
-CalcPrivateDepthStencilStateSize
-CalcPrivateDepthStencilViewSize
-CalcPrivateElementLayoutSize
-CalcPrivateGeometryShaderWithStreamOutput
-CalcPrivateOpenedResourceSize
-CalcPrivateQuerySize
-CalcPrivateRasterizerStateSize
-CalcPrivateRenderTargetViewSize
-CalcPrivateResourceSize
-CalcPrivateSamplerSize
-CalcPrivateShaderResourceViewSize
-CalcPrivateShaderSize
-CalcDeferredContextHandleSize (D3D11 only)
-CalcPrivateCommandListSize (D3D11 only)
-CalcPrivateDeferredContextSize (D3D11 only)
-CalcPrivateTessellationShaderSize (D3D11 only)
-CalcPrivateUnorderedAccessViewSize (D3D11 only)
-	! D3D11 allocates private objects itself, using the size computed here
-	* Gallium could do something similar to be able to put the private data inline into state tracker objects: this would allow them to fit in the same cacheline and improve performance
-
-CheckDeferredContextHandleSizes (D3D11 only)
-	- Gallium does not support deferred contexts
-
-CheckFormatSupport -> screen->is_format_supported
-	! Gallium passes usages to this function, D3D11 returns them
-	- Gallium does not differentiate between blendable and non-blendable render targets
-	! Gallium includes sample count directly, D3D11 uses additional query 
-
-CheckMultisampleQualityLevels
-	! is merged with is_format_supported
-
-CommandListExecute (D3D11 only)
-	- Gallium does not support command lists
-
-CopyStructureCount (D3D11 only)
-	- Gallium does not support unordered access views (views that can be written to arbitrarily from compute shaders)
-
-ClearDepthStencilView -> clear_depth_stencil
-ClearRenderTargetView -> clear_render_target
-	# D3D11 is not totally clear about whether this applies to any view or only a "currently-bound view"
-	+ Gallium allows to clear both depth/stencil and render target(s) in a single operation
-	+ Gallium supports double-precision depth values (but not rgba values!)
-	* May want to also support double-precision rgba or use "float" for "depth"
-
-ClearUnorderedAccessViewFloat (D3D11 only)
-ClearUnorderedAccessViewUint (D3D11 only)
-	- Gallium does not support unordered access views (views that can be written to arbitrarily from compute shaders)
-
-CreateBlendState (extended in D3D10.1) -> create_blend_state
-	# D3D10 does not support per-RT blend modes (but per-RT blending), only D3D10.1 does
-	+ Gallium supports logic ops
-	+ Gallium supports dithering
-	+ Gallium supports using the broadcast alpha component of the blend constant color
-
-CreateCommandList (D3D11 only)
-	- Gallium does not support command lists
-
-CreateComputeShader (D3D11 only)
-	- Gallium does not support compute shaders
-
-CreateDeferredContext (D3D11 only)
-	- Gallium does not support deferred contexts
-
-CreateDomainShader (D3D11 only)
-	- Gallium does not support domain shaders
-
-CreateHullShader (D3D11 only)
-	- Gallium does not support hull shaders
-
-CreateUnorderedAccessView (D3D11 only)
-	- Gallium does not support unordered access views
-
-CreateDepthStencilState -> create_depth_stencil_alpha_state
-	! D3D11 has both a global stencil enable, and front/back enables; Gallium has only front/back enables
-	+ Gallium has per-face writemask/valuemasks, D3D11 uses the same value for back and front
-	+ Gallium supports the alpha test, which D3D11 lacks
-
-CreateDepthStencilView -> create_surface
-CreateRenderTargetView -> create_surface
-	! Gallium merges depthstencil and rendertarget views into pipe_surface
-	- lack of render-to-buffer support
-	+ Gallium supports using 3D texture zslices as a depth/stencil buffer (in theory)
-
-CreateElementLayout -> create_vertex_elements_state
-	! D3D11 allows sparse vertex elements (via InputRegister); in Gallium they must be specified sequentially
-	! D3D11 has an extra flag (InputSlotClass) that is the same as instance_divisor == 0
-
-CreateGeometryShader -> create_gs_state
-CreateGeometryShaderWithStreamOutput -> create_gs_state + create_stream_output_state
-CreatePixelShader -> create_fs_state
-CreateVertexShader -> create_vs_state
-	> bytecode is different (see D3d10tokenizedprogramformat.hpp)
-	! D3D11 describes input/outputs separately from bytecode; Gallium has the tgsi_scan.c module to extract it from TGSI
-	@ TODO: look into DirectX 10/11 semantics specification and bytecode
-
-CheckCounter
-CheckCounterInfo
-CreateQuery -> create_query
-	! D3D11 implements fences with "event" queries
-	* others are performance counters, we may want them but they are not critical
-
-CreateRasterizerState
-	+ Gallium, like OpenGL, supports PIPE_POLYGON_MODE_POINT
-	+ Gallium, like OpenGL, supports per-face polygon fill modes
-	+ Gallium, like OpenGL, supports culling everything
-	+ Gallium, like OpenGL, supports two-side lighting; D3D11 only has the facing attribute
-	+ Gallium, like OpenGL, supports per-fill-mode polygon offset enables
-	+ Gallium, like OpenGL, supports polygon smoothing
-	+ Gallium, like OpenGL, supports polygon stipple
-	+ Gallium, like OpenGL, supports point smoothing
-	+ Gallium, like OpenGL, supports point sprites
-	+ Gallium supports specifying point quad rasterization
-	+ Gallium, like OpenGL, supports per-point point size
-	+ Gallium, like OpenGL, supports line smoothing
-	+ Gallium, like OpenGL, supports line stipple
-	+ Gallium supports line last pixel rule specification
-	+ Gallium, like OpenGL, supports provoking vertex convention
-	+ Gallium supports D3D9 rasterization rules
-	+ Gallium supports fixed line width
-	+ Gallium supports fixed point size
-
-CreateResource -> texture_create or buffer_create
-	! D3D11 passes the dimensions of all mipmap levels to the create call, while Gallium has an implicit floor(x/2) rule
-	# Note that hardware often has the implicit rule, so the D3D11 interface seems to make little sense
-	# Also, the D3D11 API does not allow the user to specify mipmap sizes, so this really seems a dubious decision on Microsoft's part
-	- D3D11 supports specifying initial data to write in the resource
-	- Gallium does not support unordered access buffers
-	! D3D11 specifies mapping flags (i.e. read/write/discard);:it's unclear what they are used for here
-	- D3D11 supports odd things in the D3D10_DDI_RESOURCE_MISC_FLAG enum (D3D10_DDI_RESOURCE_MISC_DISCARD_ON_PRESENT, D3D11_DDI_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS, D3D11_DDI_RESOURCE_MISC_BUFFER_STRUCTURED)
-	- Gallium does not support indirect draw call parameter buffers
-	! D3D11 supports specifying hardware modes and other stuff here for scanout resources
-	! D3D11 implements cube maps as 2D array textures
-
-CreateSampler
-	- D3D11 supports a monochrome convolution filter for "text filtering"
-	+ Gallium supports non-normalized coordinates
-	+ Gallium supports CLAMP, MIRROR_CLAMP and MIRROR_CLAMP_TO_BORDER
-	+ Gallium supports setting min/max/mip filters and anisotropy independently
-
-CreateShaderResourceView (extended in D3D10.1) -> create_sampler_view
-	+ Gallium supports specifying a swizzle
-	! D3D11 implements "cube views" as views into a 2D array texture
-
-CsSetConstantBuffers (D3D11 only)
-CsSetSamplers (D3D11 only)
-CsSetShader (D3D11 only)
-CsSetShaderResources (D3D11 only)
-CsSetShaderWithIfaces (D3D11 only)
-CsSetUnorderedAccessViews (D3D11 only)
-	- Gallium does not support compute shaders
-
-DestroyBlendState
-DestroyCommandList (D3D11 only)
-DestroyDepthStencilState
-DestroyDepthStencilView
-DestroyDevice
-DestroyElementLayout
-DestroyQuery
-DestroyRasterizerState
-DestroyRenderTargetView
-DestroyResource
-DestroySampler
-DestroyShader
-DestroyShaderResourceView
-DestroyUnorderedAccessView (D3D11 only)
-	# these are trivial
-
-Dispatch (D3D11 only)
-	- Gallium does not support compute shaders
-
-DispatchIndirect (D3D11 only)
-	- Gallium does not support compute shaders
-
-Draw -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-
-DrawAuto -> draw_auto
-
-DrawIndexed -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-	+ D3D11 lacks explicit range, which is required for OpenGL
-
-DrawIndexedInstanced -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-
-DrawIndexedInstancedIndirect (D3D11 only)
-	# this allows to use an hardware buffer to specify the parameters for multiple draw_vbo calls
-	- Gallium does not support draw call parameter buffers and indirect draw
-
-DrawInstanced -> draw_vbo
-	! D3D11 sets primitive modes separately with IaSetTopology: it's not obvious which is better
-
-DrawInstancedIndirect (D3D11 only)
-	# this allows to use an hardware buffer to specify the parameters for multiple draw_vbo calls
-	- Gallium does not support draw call parameter buffers and indirect draws
-
-DsSetConstantBuffers (D3D11 only)
-DsSetSamplers (D3D11 only)
-DsSetShader (D3D11 only)
-DsSetShaderResources (D3D11 only)
-DsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support domain shaders
-
-Flush -> flush
-	! Gallium supports fencing, D3D11 just has a dumb glFlush-like function
-
-GenMips
-	- Gallium lacks a mipmap generation interface, and does this manually with the 3D engine
-	* it may be useful to add a mipmap generation interface, since the hardware (especially older cards) may have a better way than using the 3D engine
-
-GsSetConstantBuffers -> for(i = StartBuffer; i < NumBuffers; ++i) set_constant_buffer(PIPE_SHADER_GEOMETRY, i, phBuffers[i])
-
-GsSetSamplers
-	- Gallium does not support sampling in geometry shaders
-
-GsSetShader -> bind_gs_state
-
-GsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support shader interfaces
-
-GsSetShaderResources
-	- Gallium does not support sampling in geometry shaders
-
-HsSetConstantBuffers (D3D11 only)
-HsSetSamplers (D3D11 only)
-HsSetShader (D3D11 only)
-HsSetShaderResources (D3D11 only)
-HsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support hull shaders
-
-IaSetIndexBuffer -> set_index_buffer
-	+ Gallium supports 8-bit indices
-	# the D3D11 interface allows index-size-unaligned byte offsets into the index buffer; most drivers will abort with an assertion
-
-IaSetInputLayout -> bind_vertex_elements_state
-
-IaSetTopology
-	! Gallium passes the topology = primitive type to the draw calls
-	* may want to add an interface for this
-	- Gallium lacks support for DirectX 11 tessellated primitives
-	+ Gallium supports line loops, triangle fans, quads, quad strips and polygons
-
-IaSetVertexBuffers -> set_vertex_buffers
-	- Gallium only allows setting all vertex buffers at once, while D3D11 supports setting a subset
-
-OpenResource -> texture_from_handle
-
-PsSetConstantBuffers -> for(i = StartBuffer; i < NumBuffers; ++i) set_constant_buffer(PIPE_SHADER_FRAGMENT, i, phBuffers[i])
-	* may want to split into fragment/vertex-specific versions
-
-PsSetSamplers -> bind_fragment_sampler_states
-	* may want to allow binding subsets instead of all at once
-
-PsSetShader -> bind_fs_state
-
-PsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support shader interfaces
-
-PsSetShaderResources -> set_sampler_views
-	* may want to allow binding subsets instead of all at once
-
-QueryBegin -> begin_query
-
-QueryEnd -> end_query
-
-QueryGetData -> get_query_result
-	- D3D11 supports reading an arbitrary data chunk for query results, Gallium only supports reading a 64-bit integer
-	+ D3D11 doesn't seem to support actually waiting for the query result (?!)
-	- D3D11 supports optionally not flushing command buffers here and instead returning DXGI_DDI_ERR_WASSTILLDRAWING
-
-RecycleCommandList (D3D11 only)
-RecycleCreateCommandList (D3D11 only)
-RecycleDestroyCommandList (D3D11 only)
-	- Gallium does not support command lists
-
-RecycleCreateDeferredContext (D3D11 only)
-	- Gallium does not support deferred contexts
-
-RelocateDeviceFuncs
-	- Gallium does not support moving pipe_context, while D3D11 seems to, using this
-
-ResetPrimitiveID (D3D10.1+ only, #ifdef D3D10PSGP)
-	# used to do vertex processing on the GPU on Intel G45 chipsets when it is faster this way (see www.intel.com/Assets/PDF/whitepaper/322931.pdf)
-	# presumably this resets the primitive id system value
-	- Gallium does not support vertex pipeline bypass anymore
-
-ResourceCopy
-ResourceCopyRegion
-ResourceConvert (D3D10.1+ only)
-ResourceConvertRegion (D3D10.1+ only)
-	-> resource_copy_region
-
-ResourceIsStagingBusy ->
-	- Gallium lacks this
-	+ Gallium can use fences
-
-ResourceReadAfterWriteHazard
-	- Gallium lacks this
-
-ResourceResolveSubresource -> blit
-
-ResourceMap
-ResourceUnmap
-DynamicConstantBufferMapDiscard
-DynamicConstantBufferUnmap
-DynamicIABufferMapDiscard
-DynamicIABufferMapNoOverwrite
-DynamicIABufferUnmap
-DynamicResourceMapDiscard
-DynamicResourceUnmap
-StagingResourceMap
-StagingResourceUnmap
-	-> transfer functions
-	! Gallium and D3D have different semantics for transfers
-	* D3D separates vertex/index buffers from constant buffers
-	! D3D separates some buffer flags into specialized calls
-
-ResourceUpdateSubresourceUP -> transfer functionality, transfer_inline_write in gallium-resources
-DefaultConstantBufferUpdateSubresourceUP -> transfer functionality, transfer_inline_write in gallium-resources
-
-SetBlendState -> bind_blend_state, set_blend_color and set_sample_mask
-	! D3D11 fuses bind_blend_state, set_blend_color and set_sample_mask in a single function
-
-SetDepthStencilState -> bind_depth_stencil_alpha_state and set_stencil_ref
-	! D3D11 fuses bind_depth_stencil_alpha_state and set_stencil_ref in a single function
-
-SetPredication -> render_condition
-	# here both D3D11 and Gallium seem very limited (hardware is too, probably though)
-	# ideally, we should support nested conditional rendering, as well as more complex tests (checking for an arbitrary range, after an AND with arbitrary mask )
-	# of couse, hardware support is probably as limited as OpenGL/D3D11
-	+ Gallium, like NV_conditional_render, supports by-region and wait flags
-	- D3D11 supports predication conditional on being equal any value (along with occlusion predicates); Gallium only supports on non-zero
-
-SetRasterizerState -> bind_rasterizer_state
-
-SetRenderTargets (extended in D3D11) -> set_framebuffer_state
-	! Gallium passed a width/height here, D3D11 does not
-	! Gallium lacks ClearTargets (but this is redundant and the driver can trivially compute this if desired)
-	- Gallium does not support unordered access views
-	- Gallium does not support geometry shader selection of texture array image / 3D texture zslice
-
-SetResourceMinLOD (D3D11 only) -> pipe_sampler_view::tex::first_level
-
-SetScissorRects
-	- Gallium lacks support for multiple geometry-shader-selectable scissor rectangles D3D11 has
-
-SetTextFilterSize
-	- Gallium lacks support for text filters
-
-SetVertexPipelineOutput (D3D10.1+ only)
-	# used to do vertex processing on the GPU on Intel G45 chipsets when it is faster this way (see www.intel.com/Assets/PDF/whitepaper/322931.pdf)
-	- Gallium does not support vertex pipeline bypass anymore
-
-SetViewports
-	- Gallium lacks support for multiple geometry-shader-selectable viewports D3D11 has
-
-ShaderResourceViewReadAfterWriteHazard
-	- Gallium lacks support for this
-	+ Gallium has texture_barrier
-
-SoSetTargets -> set_stream_output_buffers
-
-VsSetConstantBuffers -> for(i = StartBuffer; i < NumBuffers; ++i) set_constant_buffer(PIPE_SHADER_VERTEX, i, phBuffers[i])
-	* may want to split into fragment/vertex-specific versions
-
-VsSetSamplers -> bind_vertex_sampler_states
-	* may want to allow binding subsets instead of all at once
-
-VsSetShader -> bind_vs_state
-
-VsSetShaderWithIfaces (D3D11 only)
-	- Gallium does not support shader interfaces
-
-VsSetShaderResources  -> set_sampler_views
-	* may want to allow binding subsets instead of all at once
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 0908ee7e058..a7d08d2c7f9 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -131,14 +131,14 @@ from a shader without an associated sampler.  This means that they
 have no support for floating point coordinates, address wrap modes or
 filtering.
 
-Shader resources are specified for all the shader stages at once using
-the ``set_shader_resources`` method.  When binding texture resources,
-the ``level``, ``first_layer`` and ``last_layer`` pipe_surface fields
-specify the mipmap level and the range of layers the texture will be
-constrained to.  In the case of buffers, ``first_element`` and
-``last_element`` specify the range within the buffer that will be used
-by the shader resource.  Writes to a shader resource are only allowed
-when the ``writable`` flag is set.
+There are 2 types of shader resources: buffers and images.
+
+Buffers are specified using the ``set_shader_buffers`` method.
+
+Images are specified using the ``set_shader_images`` method. When binding
+images, the ``level``, ``first_layer`` and ``last_layer`` pipe_image_view
+fields specify the mipmap level and the range of layers the image will be
+constrained to.
 
 Surfaces
 ^^^^^^^^
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 8f64817fe5f..2c0da016d08 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -254,6 +254,19 @@ The integer capabilities:
   and size must be page-aligned.
 * ``PIPE_CAP_DEVICE_RESET_STATUS_QUERY``:
   Whether pipe_context::get_device_reset_status is implemented.
+* ``PIPE_CAP_MAX_SHADER_PATCH_VARYINGS``:
+  How many per-patch outputs and inputs are supported between tessellation
+  control and tessellation evaluation shaders, not counting TESSINNER and
+  TESSOUTER. The minimum allowed value for OpenGL is 30.
+* ``PIPE_CAP_TEXTURE_FLOAT_LINEAR``: Whether the linear minification and
+  magnification filters are supported with single-precision floating-point
+  textures.
+* ``PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR``: Whether the linear minification and
+  magnification filters are supported with half-precision floating-point
+  textures.
+* ``PIPE_CAP_DEPTH_BOUNDS_TEST``: Whether bounds_test, bounds_min, and
+  bounds_max states of pipe_depth_stencil_alpha_state behave according
+  to the GL_EXT_depth_bounds_test specification.
 
 
 .. _pipe_capf:
@@ -384,6 +397,8 @@ pipe_screen::get_compute_param.
   Value type: ``uint32_t``
 * ``PIPE_COMPUTE_CAP_IMAGES_SUPPORTED``: Whether images are supported
   non-zero means yes, zero means no. Value type: ``uint32_t``
+* ``PIPE_COMPUTE_CAP_SUBGROUP_SIZE``: The size of a basic execution unit in
+  threads. Also known as wavefront size, warp size or SIMD width.
 
 .. _pipe_bind:
 
@@ -424,8 +439,10 @@ resources might be created and handled quite differently.
   process.
 * ``PIPE_BIND_GLOBAL``: A buffer that can be mapped into the global
   address space of a compute program.
-* ``PIPE_BIND_SHADER_RESOURCE``: A buffer or texture that can be
-  bound to the graphics pipeline as a shader resource.
+* ``PIPE_BIND_SHADER_BUFFER``: A buffer without a format that can be bound
+  to a shader and can be used with load, store, and atomic instructions.
+* ``PIPE_BIND_SHADER_IMAGE``: A buffer or texture with a format that can be
+  bound to a shader and can be used with load, store, and atomic instructions.
 * ``PIPE_BIND_COMPUTE_RESOURCE``: A buffer or texture that can be
   bound to the compute program as a shader resource.
 * ``PIPE_BIND_COMMAND_ARGS_BUFFER``: A buffer that may be sourced by the
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 89ca172080e..314c9ca8fa2 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2591,7 +2591,7 @@ Array Declaration
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
 Declarations can optional have an ArrayID attribute which can be referred by
-indirect addressing operands. An ArrayID of zero is reserved and treaded as
+indirect addressing operands. An ArrayID of zero is reserved and treated as
 if no ArrayID is specified.
 
 If an indirect addressing operand refers to a specific declaration by using
@@ -2603,6 +2603,7 @@ not relative to the specified declaration
 If no ArrayID is specified with an indirect addressing operand the whole
 register file might be accessed by this operand. This is strongly discouraged
 and will prevent packing of scalar/vec2 arrays and effective alias analysis.
+This is only legal for TEMP and CONST register files.
 
 Declaration Semantic
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/gallium/drivers/freedreno/Android.mk b/src/gallium/drivers/freedreno/Android.mk
index a6712b2c115..ed51835e1fb 100644
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -28,7 +28,9 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
 	$(a2xx_SOURCES) \
-	$(a3xx_SOURCES)
+	$(a3xx_SOURCES)	\
+	$(a4xx_SOURCES) \
+	$(ir3_SOURCES)
 
 LOCAL_CFLAGS := \
 	-Wno-packed-bitfield-compat
@@ -37,6 +39,7 @@ LOCAL_C_INCLUDES := \
 	$(LOCAL_PATH)/ir3
 
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_freedreno
+LOCAL_STATIC_LIBRARIES := libmesa_glsl
 LOCAL_MODULE := libmesa_pipe_freedreno
 
 include $(GALLIUM_COMMON_MK)
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index cbf62c6daae..dff95ba5270 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index f4f6b94c1ea..c4516baf2ec 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
-Copyright (C) 2013-2014 by the following authors:
+Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 
 Permission is hereby granted, free of charge, to any person obtaining
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
index 7cafcd3747e..3c8d8f7c09f 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h
@@ -39,7 +39,7 @@ struct fd2_blend_stateobj {
 	uint32_t rb_colormask;
 };
 
-static INLINE struct fd2_blend_stateobj *
+static inline struct fd2_blend_stateobj *
 fd2_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd2_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
index a0bf01ffd1f..6089ebc1516 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c
@@ -67,7 +67,7 @@ create_solid_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A2XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_PSIZE,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_LINE_LOOP]      = DI_PT_LINELOOP,
@@ -77,7 +77,7 @@ static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = {
 };
 
 static const uint8_t a20x_primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A2XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_PSIZE,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_TRIANGLES]      = DI_PT_TRILIST,
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.h b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
index de845f07a85..74147107930 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_context.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.h
@@ -40,7 +40,7 @@ struct fd2_context {
 	struct pipe_resource *solid_vertexbuf;
 };
 
-static INLINE struct fd2_context *
+static inline struct fd2_context *
 fd2_context(struct fd_context *ctx)
 {
 	return (struct fd2_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
index adc0653132b..9e53cd3be75 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h
@@ -43,7 +43,7 @@ struct fd2_rasterizer_stateobj {
 	uint32_t pa_su_sc_mode_cntl;
 };
 
-static INLINE struct fd2_rasterizer_stateobj *
+static inline struct fd2_rasterizer_stateobj *
 fd2_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd2_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
index 4fffa08b3c3..5c9236851bd 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h
@@ -42,7 +42,7 @@ struct fd2_sampler_stateobj {
 	uint32_t tex0, tex3, tex4, tex5;
 };
 
-static INLINE struct fd2_sampler_stateobj *
+static inline struct fd2_sampler_stateobj *
 fd2_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd2_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd2_pipe_sampler_view {
 	uint32_t tex0, tex2, tex3;
 };
 
-static INLINE struct fd2_pipe_sampler_view *
+static inline struct fd2_pipe_sampler_view *
 fd2_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd2_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
index dda1e552174..15609ad0267 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h
@@ -44,7 +44,7 @@ struct fd2_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd2_zsa_stateobj *
+static inline struct fd2_zsa_stateobj *
 fd2_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd2_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index a3bc74eda85..8e8cf6a03f2 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
index 4f6eeb74481..142df7c300f 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h
@@ -32,6 +32,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 struct fd3_blend_stateobj {
 	struct pipe_blend_state base;
 	struct {
@@ -42,10 +44,10 @@ struct fd3_blend_stateobj {
 		/* Blend control bits for alpha channel */
 		uint32_t blend_control_alpha;
 		uint32_t control;
-	} rb_mrt[4];
+	} rb_mrt[A3XX_MAX_RENDER_TARGETS];
 };
 
-static INLINE struct fd3_blend_stateobj *
+static inline struct fd3_blend_stateobj *
 fd3_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd3_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
index 7e5a99ea571..dc33783e398 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c
@@ -88,7 +88,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A3XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_LINE_LOOP]      = DI_PT_LINELOOP,
@@ -121,6 +121,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv)
 	fd3_gmem_init(pctx);
 	fd3_texture_init(pctx);
 	fd3_prog_init(pctx);
+	fd3_emit_init(pctx);
 
 	pctx = fd_context_init(&fd3_ctx->base, pscreen, primtypes, priv);
 	if (!pctx)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 77e4605e550..6e20b2ff9bc 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -112,7 +112,7 @@ struct fd3_context {
 	struct ir3_shader_key last_key;
 };
 
-static INLINE struct fd3_context *
+static inline struct fd3_context *
 fd3_context(struct fd_context *ctx)
 {
 	return (struct fd3_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b5838b58eb2..a9498835011 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -60,6 +60,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	const struct pipe_draw_info *info = emit->info;
 	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 
+	if (!(fd3_emit_get_vp(emit) && fd3_emit_get_fp(emit)))
+		return;
+
 	fd3_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
@@ -79,8 +82,8 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			info->restart_index : 0xffffffff);
 
 	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
-		info->mode == PIPE_PRIM_POINTS)
-		primtype = DI_PT_POINTLIST_A2XX;
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
 
 	fd_draw_emit(ctx, ring,
 			primtype,
@@ -240,10 +243,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 		.vtx  = &fd3_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-							   fd3_half_precision(pfb->cbufs[1]) &&
-							   fd3_half_precision(pfb->cbufs[2]) &&
-							   fd3_half_precision(pfb->cbufs[3])),
+			.half_precision = fd_half_precision(pfb),
 		},
 	};
 
@@ -321,7 +321,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 				A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP));
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) |
 				A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_ALWAYS) |
@@ -342,7 +342,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 
 	fd3_emit_vertex_bufs(ring, &emit);
 
-	fd3_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd3_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
 	OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) |
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 07cc2266d08..752e7f88cb9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -43,19 +43,26 @@
 #include "fd3_format.h"
 #include "fd3_zsa.h"
 
+static const enum adreno_state_block sb[] = {
+	[SHADER_VERTEX]   = SB_VERT_SHADER,
+	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
+};
+
 /* regid:          base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords:     size of const value buffer
  */
 void
-fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
 	enum adreno_state_src src;
 
+	debug_assert((regid % 4) == 0);
+	debug_assert((sizedwords % 4) == 0);
+
 	if (prsc) {
 		sz = 0;
 		src = SS_INDIRECT;
@@ -67,7 +74,7 @@ fd3_emit_constant(struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
 			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
 			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/2));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
@@ -84,89 +91,31 @@ fd3_emit_constant(struct fd_ringbuffer *ring,
 }
 
 static void
-emit_constants(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
-		struct fd_constbuf_stateobj *constbuf,
-		struct ir3_shader_variant *shader,
-		bool emit_immediates)
+fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+		uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets)
 {
-	uint32_t enabled_mask = constbuf->enabled_mask;
-	uint32_t max_const;
-	int i;
+	uint32_t i;
 
-	// XXX TODO only emit dirty consts.. but we need to keep track if
-	// they are clobbered by a clear, gmem2mem, or mem2gmem..
-	constbuf->dirty_mask = enabled_mask;
+	debug_assert((regid % 4) == 0);
+	debug_assert((num % 4) == 0);
 
-	/* in particular, with binning shader we may end up with unused
-	 * consts, ie. we could end up w/ constlen that is smaller
-	 * than first_immediate.  In that case truncate the user consts
-	 * early to avoid HLSQ lockup caused by writing too many consts
-	 */
-	max_const = MIN2(shader->first_driver_param, shader->constlen);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
+			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
+			CP_LOAD_STATE_0_NUM_UNIT(num/2));
+	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
 
-	/* emit user constants: */
-	if (enabled_mask & 1) {
-		const unsigned index = 0;
-		struct pipe_constant_buffer *cb = &constbuf->cb[index];
-		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
-
-		// I expect that size should be a multiple of vec4's:
-		assert(size == align(size, 4));
-
-		/* and even if the start of the const buffer is before
-		 * first_immediate, the end may not be:
-		 */
-		size = MIN2(size, 4 * max_const);
-
-		if (size && constbuf->dirty_mask & (1 << index)) {
-			fd3_emit_constant(ring, sb, 0,
-							  cb->buffer_offset, size,
-							  cb->user_buffer, cb->buffer);
-			constbuf->dirty_mask &= ~(1 << index);
-		}
-
-		enabled_mask &= ~(1 << index);
-	}
-
-	if (shader->constlen > shader->first_driver_param) {
-		uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
-		/* emit ubos: */
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param * 2) |
-				 CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				 CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				 CP_LOAD_STATE_0_NUM_UNIT(params * 2));
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				 CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-
-		for (i = 1; i <= params * 4; i++) {
-			struct pipe_constant_buffer *cb = &constbuf->cb[i];
-			assert(!cb->user_buffer);
-			if ((enabled_mask & (1 << i)) && cb->buffer)
-				OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
-			else
-				OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
-		}
-	}
-
-	/* emit shader immediates: */
-	if (shader && emit_immediates) {
-		int size = shader->immediates_count;
-		uint32_t base = shader->first_immediate;
-
-		/* truncate size to avoid writing constants that shader
-		 * does not use:
-		 */
-		size = MIN2(size + base, shader->constlen) - base;
-
-		/* convert out of vec4: */
-		base *= 4;
-		size *= 4;
-
-		if (size > 0) {
-			fd3_emit_constant(ring, sb, base,
-				0, size, shader->immediates[0].val, NULL);
+	for (i = 0; i < num; i++) {
+		if (bos[i]) {
+			if (write) {
+				OUT_RELOCW(ring, bos[i], offsets[i], 0, 0);
+			} else {
+				OUT_RELOC(ring, bos[i], offsets[i], 0, 0);
+			}
+		} else {
+			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
 }
@@ -302,14 +251,15 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
 		for (i = 0; i < tex->num_textures; i++) {
 			static const struct fd3_pipe_sampler_view dummy_view = {
+					.base.target = PIPE_TEXTURE_1D, /* anything !PIPE_BUFFER */
 					.base.u.tex.first_level = 1,
 			};
 			const struct fd3_pipe_sampler_view *view = tex->textures[i] ?
 					fd3_pipe_sampler_view(tex->textures[i]) :
 					&dummy_view;
 			struct fd_resource *rsc = fd_resource(view->base.texture);
-			unsigned start = view->base.u.tex.first_level;
-			unsigned end   = view->base.u.tex.last_level;
+			unsigned start = fd_sampler_first_level(&view->base);
+			unsigned end   = fd_sampler_last_level(&view->base);
 
 			for (j = 0; j < (end - start + 1); j++) {
 				struct fd_resource_slice *slice =
@@ -392,6 +342,7 @@ fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring,
 			format = fd3_gmem_restore_format(rsc->base.b.format);
 		}
 
+		/* note: PIPE_BUFFER disallowed for surfaces */
 		unsigned lvl = psurf[i]->u.tex.level;
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
 
@@ -444,7 +395,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	uint32_t total_in = 0;
 	const struct fd_vertex_state *vtx = emit->vtx;
 	struct ir3_shader_variant *vp = fd3_emit_get_vp(emit);
-	unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0);
+	unsigned vertex_regid = regid(63, 0);
+	unsigned instance_regid = regid(63, 0);
+	unsigned vtxcnt_regid = regid(63, 0);
 
 	for (i = 0; i < vp->inputs_count; i++) {
 		uint8_t semantic = sem2name(vp->inputs[i].semantic);
@@ -452,14 +405,17 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			vertex_regid = vp->inputs[i].regid;
 		else if (semantic == TGSI_SEMANTIC_INSTANCEID)
 			instance_regid = vp->inputs[i].regid;
+		else if (semantic == IR3_SEMANTIC_VTXCNT)
+			vtxcnt_regid = vp->inputs[i].regid;
 		else if (i < vtx->vtx->num_elements && vp->inputs[i].compmask)
 			last = i;
 	}
 
 	/* hw doesn't like to be configured for zero vbo's, it seems: */
-	if (vtx->vtx->num_elements == 0 &&
-		vertex_regid == regid(63, 0) &&
-		instance_regid == regid(63, 0))
+	if ((vtx->vtx->num_elements == 0) &&
+			(vertex_regid == regid(63, 0)) &&
+			(instance_regid == regid(63, 0)) &&
+			(vtxcnt_regid == regid(63, 0)))
 		return;
 
 	for (i = 0, j = 0; i <= last; i++) {
@@ -472,8 +428,9 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 			enum pipe_format pfmt = elem->src_format;
 			enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
-				vertex_regid != regid(63, 0) ||
-				instance_regid != regid(63, 0);
+					(vertex_regid != regid(63, 0)) ||
+					(instance_regid != regid(63, 0)) ||
+					(vtxcnt_regid != regid(63, 0));
 			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t fs = util_format_get_blocksize(pfmt);
 
@@ -512,6 +469,10 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit)
 	OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
 			A3XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
 			A3XX_VFD_CONTROL_1_REGID4INST(instance_regid));
+
+	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
+	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
+			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(vtxcnt_regid));
 }
 
 void
@@ -669,33 +630,12 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_EVENT_WRITE, 1);
 	OUT_RING(ring, HLSQ_FLUSH);
 
-	if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
-			/* evil hack to deal sanely with clear path: */
-			(emit->prog == &ctx->prog)) {
-		fd_wfi(ctx, ring);
-		emit_constants(ring,  SB_VERT_SHADER,
-				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);
-		if (!emit->key.binning_pass) {
-			emit_constants(ring, SB_FRAG_SHADER,
-					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
-		}
-	}
-
-	/* emit driver params every time */
-	if (emit->info && emit->prog == &ctx->prog) {
-		uint32_t vertex_params[4] = {
-			emit->info->indexed ? emit->info->index_bias : emit->info->start,
-			0,
-			0,
-			0
-		};
-		if (vp->constlen >= vp->first_driver_param + 4) {
-			fd3_emit_constant(ring, SB_VERT_SHADER,
-							  (vp->first_driver_param + 4) * 4,
-							  0, 4, vertex_params, NULL);
-		}
+	if (emit->prog == &ctx->prog) { /* evil hack for the clear path: only the context's own prog gets consts */
+		ir3_emit_consts(vp, ring, emit->info, dirty);
+		if (!emit->key.binning_pass)
+			ir3_emit_consts(fp, ring, emit->info, dirty);
+		/* mark clean after emitting consts; NOTE(review): also cleared when binning_pass skipped fp -- confirm intended: */
+		ctx->prog.dirty = 0;
 	}
 
 	if ((dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) && ctx->blend) {
@@ -930,3 +870,11 @@ fd3_emit_restore(struct fd_context *ctx)
 
 	ctx->needs_rb_fbd = true;
 }
+
+void
+fd3_emit_init(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->emit_const = fd3_emit_const;	/* a3xx const-upload hooks; presumably driven */
+	ctx->emit_const_bo = fd3_emit_const_bo;	/* by ir3_emit_consts() -- TODO confirm */
+}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 8f21919c9a7..795654706a7 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -37,10 +37,8 @@
 #include "ir3_shader.h"
 
 struct fd_ringbuffer;
-enum adreno_state_block;
 
-void fd3_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+void fd3_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc);
 
@@ -90,4 +88,6 @@ void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 void fd3_emit_restore(struct fd_context *ctx);
 
+void fd3_emit_init(struct pipe_context *pctx);
+
 #endif /* FD3_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.h b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
index 6afc3015901..05c5ea3d247 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.h
@@ -41,27 +41,4 @@ enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format);
 uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r,
 		unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a);
 
-static INLINE bool
-fd3_half_precision(const struct pipe_surface *surface)
-{
-	enum pipe_format format;
-	if (!surface)
-		return true;
-
-	format = surface->format;
-
-	/* colors are provided in consts, which go through cov.f32f16, which will
-	 * break these values
-	 */
-	if (util_format_is_pure_integer(format))
-		return false;
-
-	/* avoid losing precision on 32-bit float formats */
-	if (util_format_is_float(format) &&
-		util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32)
-		return false;
-
-	return true;
-}
-
 #endif /* FD3_FORMAT_H_ */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
index 7d3975761dd..9a5b45e2fcb 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@@ -57,7 +57,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		tile_mode = LINEAR;
 	}
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) {
 		enum pipe_format pformat = 0;
 		enum a3xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
@@ -537,10 +537,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			/* NOTE: They all use the same VP, this is for vtx bufs. */
 			.prog = &ctx->blit_prog[0],
 			.key = {
-				.half_precision = (fd3_half_precision(pfb->cbufs[0]) &&
-								   fd3_half_precision(pfb->cbufs[1]) &&
-								   fd3_half_precision(pfb->cbufs[2]) &&
-								   fd3_half_precision(pfb->cbufs[3]))
+				.half_precision = fd_half_precision(pfb),
 			},
 	};
 	float x0, y0, x1, y1;
@@ -654,6 +651,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 
 	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
 		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs);
 		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
 	}
@@ -674,6 +672,7 @@ fd3_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 				emit.prog = &ctx->blit_zs;
 			emit.key.half_precision = false;
 		}
+		emit.fp = NULL;      /* frag shader changed so clear cache */
 		fd3_program_emit(ring, &emit, 1, &pfb->zsbuf);
 		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
 	}
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 57fcaa9020e..b5360797745 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -51,7 +51,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state
 		enum shader_t type)
 {
 	struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj);
-	so->shader = ir3_shader_create(pctx, cso->tokens, type);
+	so->shader = ir3_shader_create(pctx, cso, type);
 	return so;
 }
 
@@ -136,6 +136,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 	int constmode;
 	int i, j, k;
 
+	debug_assert(nr <= ARRAY_SIZE(color_regid));
+
 	vp = fd3_emit_get_vp(emit);
 
 	if (emit->key.binning_pass) {
@@ -202,12 +204,12 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
 			ir3_find_output_regid(fp, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
 	} else {
-		for (int i = 0; i < fp->outputs_count; i++) {
+		for (i = 0; i < fp->outputs_count; i++) {
 			ir3_semantic sem = fp->outputs[i].semantic;
 			unsigned idx = sem2idx(sem);
 			if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
 				continue;
-			assert(idx < 4);
+			debug_assert(idx < ARRAY_SIZE(color_regid));
 			color_regid[idx] = fp->outputs[i].regid;
 		}
 	}
@@ -449,10 +451,6 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 		OUT_RING(ring, flatshade[1]);        /* SP_FS_FLAT_SHAD_MODE_REG_1 */
 	}
 
-	OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1);
-	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
-			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));
-
 	if (vpbuffer == BUFFER)
 		emit_shader(ring, vp);
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_query.c b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
index 7abab543427..8fc0a0d4229 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c
@@ -64,7 +64,7 @@ occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
 
 	OUT_PKT3(ring, CP_DRAW_INDX, 3);
 	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, DRAW(DI_PT_POINTLIST_A2XX, DI_SRC_SEL_AUTO_INDEX,
+	OUT_RING(ring, DRAW(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
 						INDEX_SIZE_IGN, USE_VISIBILITY, 0));
 	OUT_RING(ring, 0);             /* NumIndices */
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
index 7e9c1f51f59..765d9719524 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h
@@ -44,7 +44,7 @@ struct fd3_rasterizer_stateobj {
 	uint32_t pc_prim_vtx_cntl;
 };
 
-static INLINE struct fd3_rasterizer_stateobj *
+static inline struct fd3_rasterizer_stateobj *
 fd3_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd3_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 094dcf376e5..722fe360202 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -105,7 +105,7 @@ void
 fd3_screen_init(struct pipe_screen *pscreen)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
-	screen->max_rts = 4;
+	screen->max_rts = A3XX_MAX_RENDER_TARGETS;
 	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd3_context_create;
 	pscreen->is_format_supported = fd3_screen_is_format_supported;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index a278bf5c603..c30658d0e7b 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -210,8 +210,8 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 {
 	struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view);
 	struct fd_resource *rsc = fd_resource(prsc);
-	unsigned lvl = cso->u.tex.first_level;
-	unsigned miplevels = cso->u.tex.last_level - lvl;
+	unsigned lvl = fd_sampler_first_level(cso);
+	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
 	uint32_t sz2 = 0;
 
 	if (!so)
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
index c38fd847f27..d5afb03cd7a 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h
@@ -43,7 +43,7 @@ struct fd3_sampler_stateobj {
 	bool saturate_s, saturate_t, saturate_r;
 };
 
-static INLINE struct fd3_sampler_stateobj *
+static inline struct fd3_sampler_stateobj *
 fd3_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd3_sampler_stateobj *)samp;
@@ -54,7 +54,7 @@ struct fd3_pipe_sampler_view {
 	uint32_t texconst0, texconst1, texconst2, texconst3;
 };
 
-static INLINE struct fd3_pipe_sampler_view *
+static inline struct fd3_pipe_sampler_view *
 fd3_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd3_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
index 352c3dd5432..d4dc5954da5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h
@@ -45,7 +45,7 @@ struct fd3_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd3_zsa_stateobj *
+static inline struct fd3_zsa_stateobj *
 fd3_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd3_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 0e7d3cf6db1..563f70ac5eb 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -227,6 +227,7 @@ enum a4xx_depth_format {
 	DEPTH4_NONE = 0,
 	DEPTH4_16 = 1,
 	DEPTH4_24_8 = 2,
+	DEPTH4_32 = 3,
 };
 
 enum a4xx_tess_spacing {
@@ -570,6 +571,15 @@ static inline uint32_t A4XX_RB_FS_OUTPUT_SAMPLE_MASK(uint32_t val)
 	return ((val) << A4XX_RB_FS_OUTPUT_SAMPLE_MASK__SHIFT) & A4XX_RB_FS_OUTPUT_SAMPLE_MASK__MASK;
 }
 
+#define REG_A4XX_RB_SAMPLE_COUNT_CONTROL			0x000020fa
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_COPY			0x00000002
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK			0xfffffffc
+#define A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT		2
+static inline uint32_t A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR(uint32_t val)
+{
+	return ((val >> 2) << A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__SHIFT) & A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR__MASK;
+}
+
 #define REG_A4XX_RB_RENDER_COMPONENTS				0x000020fb
 #define A4XX_RB_RENDER_COMPONENTS_RT0__MASK			0x0000000f
 #define A4XX_RB_RENDER_COMPONENTS_RT0__SHIFT			0
@@ -811,6 +821,23 @@ static inline uint32_t A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(enum adreno_stencil_op v
 #define REG_A4XX_RB_STENCIL_CONTROL2				0x00002107
 #define A4XX_RB_STENCIL_CONTROL2_STENCIL_BUFFER			0x00000001
 
+#define REG_A4XX_RB_STENCIL_INFO				0x00002108
+#define A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL			0x00000001
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK			0xfffff000
+#define A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT		12
+static inline uint32_t A4XX_RB_STENCIL_INFO_STENCIL_BASE(uint32_t val)
+{
+	return ((val >> 12) << A4XX_RB_STENCIL_INFO_STENCIL_BASE__SHIFT) & A4XX_RB_STENCIL_INFO_STENCIL_BASE__MASK;
+}
+
+#define REG_A4XX_RB_STENCIL_PITCH				0x00002109
+#define A4XX_RB_STENCIL_PITCH__MASK				0xffffffff
+#define A4XX_RB_STENCIL_PITCH__SHIFT				0
+static inline uint32_t A4XX_RB_STENCIL_PITCH(uint32_t val)
+{
+	return ((val >> 5) << A4XX_RB_STENCIL_PITCH__SHIFT) & A4XX_RB_STENCIL_PITCH__MASK;
+}
+
 #define REG_A4XX_RB_STENCILREFMASK				0x0000210b
 #define A4XX_RB_STENCILREFMASK_STENCILREF__MASK			0x000000ff
 #define A4XX_RB_STENCILREFMASK_STENCILREF__SHIFT		0
@@ -1167,6 +1194,8 @@ static inline uint32_t REG_A4XX_CP_SCRATCH_REG(uint32_t i0) { return 0x00000578
 
 #define REG_A4XX_SP_VS_STATUS					0x00000ec0
 
+#define REG_A4XX_SP_MODE_CONTROL				0x00000ec3
+
 #define REG_A4XX_SP_PERFCTR_SP_SEL_11				0x00000ecf
 
 #define REG_A4XX_SP_SP_CTRL_REG					0x000022c0
@@ -1432,6 +1461,20 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val)
 	return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK;
 }
 
+#define REG_A4XX_SP_CS_CTRL_REG0				0x00002300
+
+#define REG_A4XX_SP_CS_OBJ_OFFSET_REG				0x00002301
+
+#define REG_A4XX_SP_CS_OBJ_START				0x00002302
+
+#define REG_A4XX_SP_CS_PVT_MEM_PARAM				0x00002303
+
+#define REG_A4XX_SP_CS_PVT_MEM_ADDR				0x00002304
+
+#define REG_A4XX_SP_CS_PVT_MEM_SIZE				0x00002305
+
+#define REG_A4XX_SP_CS_LENGTH_REG				0x00002306
+
 #define REG_A4XX_SP_HS_OBJ_OFFSET_REG				0x0000230d
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1454,6 +1497,76 @@ static inline uint32_t A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_HS_LENGTH_REG				0x00002312
 
+#define REG_A4XX_SP_DS_PARAM_REG				0x0000231a
+#define A4XX_SP_DS_PARAM_REG_POSREGID__MASK			0x000000ff
+#define A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT			0
+static inline uint32_t A4XX_SP_DS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_DS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK		0xfff00000
+#define A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT		20
+static inline uint32_t A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_DS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_OUT(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_OUT_REG(uint32_t i0) { return 0x0000231b + 0x1*i0; }
+#define A4XX_SP_DS_OUT_REG_A_REGID__MASK			0x000001ff
+#define A4XX_SP_DS_OUT_REG_A_REGID__SHIFT			0
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK			0x00001e00
+#define A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT			9
+static inline uint32_t A4XX_SP_DS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_REGID__MASK			0x01ff0000
+#define A4XX_SP_DS_OUT_REG_B_REGID__SHIFT			16
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_DS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK			0x1e000000
+#define A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT			25
+static inline uint32_t A4XX_SP_DS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_DS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_DS_VPC_DST_REG(uint32_t i0) { return 0x0000232c + 0x1*i0; }
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK			0x000000ff
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT			0
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK			0x0000ff00
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT			8
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK			0x00ff0000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT			16
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK			0xff000000
+#define A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT			24
+static inline uint32_t A4XX_SP_DS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_DS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_DS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_DS_OBJ_OFFSET_REG				0x00002334
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1476,6 +1589,82 @@ static inline uint32_t A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 
 #define REG_A4XX_SP_DS_LENGTH_REG				0x00002339
 
+#define REG_A4XX_SP_GS_PARAM_REG				0x00002341
+#define A4XX_SP_GS_PARAM_REG_POSREGID__MASK			0x000000ff
+#define A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT			0
+static inline uint32_t A4XX_SP_GS_PARAM_REG_POSREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_POSREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_POSREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK			0x0000ff00
+#define A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT			8
+static inline uint32_t A4XX_SP_GS_PARAM_REG_PRIMREGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_PRIMREGID__SHIFT) & A4XX_SP_GS_PARAM_REG_PRIMREGID__MASK;
+}
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK		0xfff00000
+#define A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT		20
+static inline uint32_t A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__SHIFT) & A4XX_SP_GS_PARAM_REG_TOTALGSOUTVAR__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_OUT(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_OUT_REG(uint32_t i0) { return 0x00002342 + 0x1*i0; }
+#define A4XX_SP_GS_OUT_REG_A_REGID__MASK			0x000001ff
+#define A4XX_SP_GS_OUT_REG_A_REGID__SHIFT			0
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_A_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK			0x00001e00
+#define A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT			9
+static inline uint32_t A4XX_SP_GS_OUT_REG_A_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_A_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_A_COMPMASK__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_REGID__MASK			0x01ff0000
+#define A4XX_SP_GS_OUT_REG_B_REGID__SHIFT			16
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_REGID(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_REGID__SHIFT) & A4XX_SP_GS_OUT_REG_B_REGID__MASK;
+}
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK			0x1e000000
+#define A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT			25
+static inline uint32_t A4XX_SP_GS_OUT_REG_B_COMPMASK(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_OUT_REG_B_COMPMASK__SHIFT) & A4XX_SP_GS_OUT_REG_B_COMPMASK__MASK;
+}
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+
+static inline uint32_t REG_A4XX_SP_GS_VPC_DST_REG(uint32_t i0) { return 0x00002353 + 0x1*i0; }
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK			0x000000ff
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT			0
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC0(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC0__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC0__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK			0x0000ff00
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT			8
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC1(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC1__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC1__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK			0x00ff0000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT			16
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC2(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC2__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC2__MASK;
+}
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK			0xff000000
+#define A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT			24
+static inline uint32_t A4XX_SP_GS_VPC_DST_REG_OUTLOC3(uint32_t val)
+{
+	return ((val) << A4XX_SP_GS_VPC_DST_REG_OUTLOC3__SHIFT) & A4XX_SP_GS_VPC_DST_REG_OUTLOC3__MASK;
+}
+
 #define REG_A4XX_SP_GS_OBJ_OFFSET_REG				0x0000235b
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
@@ -1677,6 +1866,18 @@ static inline uint32_t A4XX_VFD_CONTROL_3_REGID_VTXCNT(uint32_t val)
 {
 	return ((val) << A4XX_VFD_CONTROL_3_REGID_VTXCNT__SHIFT) & A4XX_VFD_CONTROL_3_REGID_VTXCNT__MASK;
 }
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__MASK			0x00ff0000
+#define A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT			16
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSX(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSX__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSX__MASK;
+}
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__MASK			0xff000000
+#define A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT			24
+static inline uint32_t A4XX_VFD_CONTROL_3_REGID_TESSY(uint32_t val)
+{
+	return ((val) << A4XX_VFD_CONTROL_3_REGID_TESSY__SHIFT) & A4XX_VFD_CONTROL_3_REGID_TESSY__MASK;
+}
 
 #define REG_A4XX_VFD_CONTROL_4					0x00002204
 
@@ -1758,6 +1959,8 @@ static inline uint32_t A4XX_VFD_DECODE_INSTR_SHIFTCNT(uint32_t val)
 
 #define REG_A4XX_TPL1_DEBUG_ECO_CONTROL				0x00000f00
 
+#define REG_A4XX_TPL1_TP_MODE_CONTROL				0x00000f03
+
 #define REG_A4XX_TPL1_PERFCTR_TP_SEL_7				0x00000f0b
 
 #define REG_A4XX_TPL1_TP_TEX_OFFSET				0x00002380
@@ -1800,6 +2003,10 @@ static inline uint32_t A4XX_TPL1_TP_TEX_COUNT_GS(uint32_t val)
 
 #define REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR		0x000023a1
 
+#define REG_A4XX_TPL1_TP_CS_BORDER_COLOR_BASE_ADDR		0x000023a4
+
+#define REG_A4XX_TPL1_TP_CS_SAMPLER_BASE_ADDR			0x000023a5
+
 #define REG_A4XX_TPL1_TP_CS_TEXMEMOBJ_BASE_ADDR			0x000023a6
 
 #define REG_A4XX_GRAS_TSE_STATUS				0x00000c80
@@ -2078,6 +2285,8 @@ static inline uint32_t A4XX_GRAS_SC_EXTENT_WINDOW_TL_Y(uint32_t val)
 
 #define REG_A4XX_HLSQ_DEBUG_ECO_CONTROL				0x00000e04
 
+#define REG_A4XX_HLSQ_MODE_CONTROL				0x00000e05
+
 #define REG_A4XX_HLSQ_PERF_PIPE_MASK				0x00000e0e
 
 #define REG_A4XX_HLSQ_CONTROL_0_REG				0x000023c0
@@ -2158,6 +2367,8 @@ static inline uint32_t A4XX_HLSQ_CONTROL_3_REG_REGID(uint32_t val)
 	return ((val) << A4XX_HLSQ_CONTROL_3_REG_REGID__SHIFT) & A4XX_HLSQ_CONTROL_3_REG_REGID__MASK;
 }
 
+#define REG_A4XX_HLSQ_CONTROL_4_REG				0x000023c4
+
 #define REG_A4XX_HLSQ_VS_CONTROL_REG				0x000023c5
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK		0x000000ff
 #define A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT		0
@@ -2293,6 +2504,36 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 	return ((val) << A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__SHIFT) & A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH__MASK;
 }
 
+#define REG_A4XX_HLSQ_CS_CONTROL				0x000023ca
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_0				0x000023cd
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_1				0x000023ce
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_2				0x000023cf
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_3				0x000023d0
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_4				0x000023d1
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_5				0x000023d2
+
+#define REG_A4XX_HLSQ_CL_NDRANGE_6				0x000023d3
+
+#define REG_A4XX_HLSQ_CL_CONTROL_0				0x000023d4
+
+#define REG_A4XX_HLSQ_CL_CONTROL_1				0x000023d5
+
+#define REG_A4XX_HLSQ_CL_KERNEL_CONST				0x000023d6
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_X				0x000023d7
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Y				0x000023d8
+
+#define REG_A4XX_HLSQ_CL_KERNEL_GROUP_Z				0x000023d9
+
+#define REG_A4XX_HLSQ_CL_WG_OFFSET				0x000023da
+
 #define REG_A4XX_HLSQ_UPDATE_CONTROL				0x000023db
 
 #define REG_A4XX_PC_BINNING_COMMAND				0x00000d00
@@ -2389,16 +2630,10 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
 
 #define REG_A4XX_UNKNOWN_0D01					0x00000d01
 
-#define REG_A4XX_UNKNOWN_0E05					0x00000e05
-
 #define REG_A4XX_UNKNOWN_0E42					0x00000e42
 
 #define REG_A4XX_UNKNOWN_0EC2					0x00000ec2
 
-#define REG_A4XX_UNKNOWN_0EC3					0x00000ec3
-
-#define REG_A4XX_UNKNOWN_0F03					0x00000f03
-
 #define REG_A4XX_UNKNOWN_2001					0x00002001
 
 #define REG_A4XX_UNKNOWN_209B					0x0000209b
@@ -2439,6 +2674,8 @@ static inline uint32_t A4XX_UNKNOWN_20F7(float val)
 
 #define REG_A4XX_UNKNOWN_22D7					0x000022d7
 
+#define REG_A4XX_UNKNOWN_2352					0x00002352
+
 #define REG_A4XX_TEX_SAMP_0					0x00000000
 #define A4XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR			0x00000001
 #define A4XX_TEX_SAMP_0_XY_MAG__MASK				0x00000006
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
index 396caa532fc..d5e823ef69d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c
@@ -61,7 +61,7 @@ fd4_blend_state_create(struct pipe_context *pctx,
 	struct fd4_blend_stateobj *so;
 //	enum a3xx_rop_code rop = ROP_COPY;
 	bool reads_dest = false;
-	int i;
+	unsigned i, mrt_blend = 0;
 
 	if (cso->logicop_enable) {
 //		rop = cso->logicop_func;  /* maps 1:1 */
@@ -84,11 +84,6 @@ fd4_blend_state_create(struct pipe_context *pctx,
 		}
 	}
 
-	if (cso->independent_blend_enable) {
-		DBG("Unsupported! independent blend state");
-		return NULL;
-	}
-
 	so = CALLOC_STRUCT(fd4_blend_stateobj);
 	if (!so)
 		return NULL;
@@ -96,7 +91,12 @@ fd4_blend_state_create(struct pipe_context *pctx,
 	so->base = *cso;
 
 	for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) {
-		const struct pipe_rt_blend_state *rt = &cso->rt[i];
+		const struct pipe_rt_blend_state *rt;
+
+		if (cso->independent_blend_enable)
+			rt = &cso->rt[i];
+		else
+			rt = &cso->rt[0];
 
 		so->rb_mrt[i].blend_control =
 				A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) |
@@ -115,7 +115,7 @@ fd4_blend_state_create(struct pipe_context *pctx,
 					A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE |
 					A4XX_RB_MRT_CONTROL_BLEND |
 					A4XX_RB_MRT_CONTROL_BLEND2;
-			so->rb_fs_output |= A4XX_RB_FS_OUTPUT_ENABLE_BLEND(1);
+			mrt_blend |= (1 << i);
 		}
 
 		if (reads_dest)
@@ -125,5 +125,7 @@ fd4_blend_state_create(struct pipe_context *pctx,
 			so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS);
 	}
 
+	so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend);
+
 	return so;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
index 33641da5e2c..7620d00a625 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h
@@ -32,17 +32,19 @@
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 struct fd4_blend_stateobj {
 	struct pipe_blend_state base;
 	struct {
 		uint32_t control;
 		uint32_t buf_info;
 		uint32_t blend_control;
-	} rb_mrt[8];
+	} rb_mrt[A4XX_MAX_RENDER_TARGETS];
 	uint32_t rb_fs_output;
 };
 
-static INLINE struct fd4_blend_stateobj *
+static inline struct fd4_blend_stateobj *
 fd4_blend_stateobj(struct pipe_blend_state *blend)
 {
 	return (struct fd4_blend_stateobj *)blend;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
index 2321876dd48..e172d350517 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c
@@ -86,7 +86,7 @@ create_blit_texcoord_vertexbuf(struct pipe_context *pctx)
 }
 
 static const uint8_t primtypes[PIPE_PRIM_MAX] = {
-		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST_A3XX,
+		[PIPE_PRIM_POINTS]         = DI_PT_POINTLIST,
 		[PIPE_PRIM_LINES]          = DI_PT_LINELIST,
 		[PIPE_PRIM_LINE_STRIP]     = DI_PT_LINESTRIP,
 		[PIPE_PRIM_LINE_LOOP]      = DI_PT_LINELOOP,
@@ -119,6 +119,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv)
 	fd4_gmem_init(pctx);
 	fd4_texture_init(pctx);
 	fd4_prog_init(pctx);
+	fd4_emit_init(pctx);
 
 	pctx = fd_context_init(&fd4_ctx->base, pscreen, primtypes, priv);
 	if (!pctx)
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 53e1bf6a2e6..0b749916841 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -90,7 +90,7 @@ struct fd4_context {
 	struct ir3_shader_key last_key;
 };
 
-static INLINE struct fd4_context *
+static inline struct fd4_context *
 fd4_context(struct fd_context *ctx)
 {
 	return (struct fd4_context *)ctx;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index de5a306af60..2bd2ca23d54 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -48,6 +48,9 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	const struct pipe_draw_info *info = emit->info;
 
+	if (!(fd4_emit_get_vp(emit) && fd4_emit_get_fp(emit)))
+		return;
+
 	fd4_emit_state(ctx, ring, emit);
 
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE))
@@ -108,7 +111,6 @@ static void
 fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 {
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 		.vtx  = &ctx->vtx,
 		.prog = &ctx->prog,
@@ -129,8 +131,9 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			.fsaturate_t = fd4_ctx->fsaturate_t,
 			.fsaturate_r = fd4_ctx->fsaturate_r,
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
-		.pformat = pipe_surface_format(pfb->cbufs[0]),
+		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
+		.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : false,
+		.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
 	};
 	unsigned dirty;
 
@@ -170,20 +173,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	unsigned dirty = ctx->dirty;
-	unsigned ce, i;
+	unsigned i;
 	struct fd4_emit emit = {
 		.vtx  = &fd4_ctx->solid_vbuf_state,
 		.prog = &ctx->solid_prog,
 		.key = {
-			.half_precision = true,
+			.half_precision = fd_half_precision(pfb),
 		},
-		.format = fd4_emit_format(pfb->cbufs[0]),
 	};
-	uint32_t colr = 0;
-
-	if ((buffers & PIPE_CLEAR_COLOR) && pfb->nr_cbufs)
-		colr  = pack_rgba(pfb->cbufs[0]->format, color->f);
 
 	dirty &= FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
 	dirty |= FD_DIRTY_PROG;
@@ -257,16 +256,15 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR) {
 		OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1);
 		OUT_RING(ring, A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER));
-		ce = 0xf;
-	} else {
-		ce = 0x0;
 	}
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = (buffers & (PIPE_CLEAR_COLOR0 << i)) ? 0xf : 0x0;
+
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
 				A4XX_RB_MRT_CONTROL_B11 |
-				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(ce));
+				A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf));
 
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) |
@@ -277,6 +275,16 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO));
 	}
 
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
@@ -285,14 +293,8 @@ fd4_clear(struct fd_context *ctx, unsigned buffers,
 	OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_RB_CLEAR_COLOR_DW0, 4);
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW0 */
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW1 */
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW2 */
-	OUT_RING(ring, colr);         /* RB_CLEAR_COLOR_DW3 */
-
 	/* until fastclear works: */
-	fd4_emit_constant(ring, SB_FRAG_SHADER, 0, 0, 4, color->ui, NULL);
+	fd4_emit_const(ring, SHADER_FRAGMENT, 0, 0, 4, color->ui, NULL);
 
 	OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2);
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
index 1bd376ca6ec..b89a30a7c4b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h
@@ -106,6 +106,7 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 {
 	struct pipe_index_buffer *idx = &ctx->indexbuf;
 	struct fd_bo *idx_bo = NULL;
+	enum pc_di_primtype primtype = ctx->primtypes[info->mode];
 	enum a4xx_index_size idx_type;
 	enum pc_di_src_sel src_sel;
 	uint32_t idx_size, idx_offset;
@@ -126,7 +127,12 @@ fd4_draw_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		src_sel = DI_SRC_SEL_AUTO_INDEX;
 	}
 
-	fd4_draw(ctx, ring, ctx->primtypes[info->mode], vismode, src_sel,
+	/* points + psize -> spritelist: */
+	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
+			(info->mode == PIPE_PRIM_POINTS))
+		primtype = DI_PT_POINTLIST_PSIZE;
+
+	fd4_draw(ctx, ring, primtype, vismode, src_sel,
 			info->count, info->instance_count,
 			idx_type, idx_size, idx_offset, idx_bo);
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 4b6eb646aa7..b75be29e523 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -43,19 +43,26 @@
 #include "fd4_format.h"
 #include "fd4_zsa.h"
 
+static const enum adreno_state_block sb[] = {
+	[SHADER_VERTEX]   = SB_VERT_SHADER,
+	[SHADER_FRAGMENT] = SB_FRAG_SHADER,
+};
+
 /* regid:          base const register
  * prsc or dwords: buffer containing constant values
  * sizedwords:     size of const value buffer
  */
 void
-fd4_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
 	uint32_t i, sz;
 	enum adreno_state_src src;
 
+	debug_assert((regid % 4) == 0);
+	debug_assert((sizedwords % 4) == 0);
+
 	if (prsc) {
 		sz = 0;
 		src = 0x2;  // TODO ??
@@ -67,7 +74,7 @@ fd4_emit_constant(struct fd_ringbuffer *ring,
 	OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
 			CP_LOAD_STATE_0_STATE_SRC(src) |
-			CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
 			CP_LOAD_STATE_0_NUM_UNIT(sizedwords/4));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
@@ -84,89 +91,31 @@ fd4_emit_constant(struct fd_ringbuffer *ring,
 }
 
 static void
-emit_constants(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
-		struct fd_constbuf_stateobj *constbuf,
-		struct ir3_shader_variant *shader,
-		bool emit_immediates)
+fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+		uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets)
 {
-	uint32_t enabled_mask = constbuf->enabled_mask;
-	uint32_t max_const;
-	int i;
+	uint32_t i;
 
-	// XXX TODO only emit dirty consts.. but we need to keep track if
-	// they are clobbered by a clear, gmem2mem, or mem2gmem..
-	constbuf->dirty_mask = enabled_mask;
+	debug_assert((regid % 4) == 0);
+	debug_assert((num % 4) == 0);
 
-	/* in particular, with binning shader we may end up with unused
-	 * consts, ie. we could end up w/ constlen that is smaller
-	 * than first_immediate.  In that case truncate the user consts
-	 * early to avoid HLSQ lockup caused by writing too many consts
-	 */
-	max_const = MIN2(shader->first_driver_param, shader->constlen);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
+			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
+			CP_LOAD_STATE_0_NUM_UNIT(num/4));
+	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
 
-	/* emit user constants: */
-	if (enabled_mask & 1) {
-		const unsigned index = 0;
-		struct pipe_constant_buffer *cb = &constbuf->cb[index];
-		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
-
-		// I expect that size should be a multiple of vec4's:
-		assert(size == align(size, 4));
-
-		/* and even if the start of the const buffer is before
-		 * first_immediate, the end may not be:
-		 */
-		size = MIN2(size, 4 * max_const);
-
-		if (size && (constbuf->dirty_mask & (1 << index))) {
-			fd4_emit_constant(ring, sb, 0,
-					cb->buffer_offset, size,
-					cb->user_buffer, cb->buffer);
-			constbuf->dirty_mask &= ~(1 << index);
-		}
-
-		enabled_mask &= ~(1 << index);
-	}
-
-	/* emit ubos: */
-	if (shader->constlen > shader->first_driver_param) {
-		uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
-		OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
-		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param) |
-				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
-				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
-				CP_LOAD_STATE_0_NUM_UNIT(params));
-		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
-				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
-
-		for (i = 1; i <= params * 4; i++) {
-			struct pipe_constant_buffer *cb = &constbuf->cb[i];
-			assert(!cb->user_buffer);
-			if ((enabled_mask & (1 << i)) && cb->buffer)
-				OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
-			else
-				OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
-		}
-	}
-
-	/* emit shader immediates: */
-	if (shader && emit_immediates) {
-		int size = shader->immediates_count;
-		uint32_t base = shader->first_immediate;
-
-		/* truncate size to avoid writing constants that shader
-		 * does not use:
-		 */
-		size = MIN2(size + base, shader->constlen) - base;
-
-		/* convert out of vec4: */
-		base *= 4;
-		size *= 4;
-
-		if (size > 0) {
-			fd4_emit_constant(ring, sb, base,
-				0, size, shader->immediates[0].val, NULL);
+	for (i = 0; i < num; i++) {
+		if (bos[i]) {
+			if (write) {
+				OUT_RELOCW(ring, bos[i], offsets[i], 0, 0);
+			} else {
+				OUT_RELOC(ring, bos[i], offsets[i], 0, 0);
+			}
+		} else {
+			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
 }
@@ -223,15 +172,19 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			const struct fd4_pipe_sampler_view *view = tex->textures[i] ?
 					fd4_pipe_sampler_view(tex->textures[i]) :
 					&dummy_view;
-			struct fd_resource *rsc = fd_resource(view->base.texture);
-			unsigned start = view->base.u.tex.first_level;
-			uint32_t offset = fd_resource_offset(rsc, start, 0);
+			unsigned start = fd_sampler_first_level(&view->base);
 
 			OUT_RING(ring, view->texconst0);
 			OUT_RING(ring, view->texconst1);
 			OUT_RING(ring, view->texconst2);
 			OUT_RING(ring, view->texconst3);
-			OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+			if (view->base.texture) {
+				struct fd_resource *rsc = fd_resource(view->base.texture);
+				uint32_t offset = fd_resource_offset(rsc, start, 0);
+				OUT_RELOC(ring, rsc->bo, offset, view->textconst4, 0);
+			} else {
+				OUT_RING(ring, 0x00000000);
+			}
 			OUT_RING(ring, 0x00000000);
 			OUT_RING(ring, 0x00000000);
 			OUT_RING(ring, 0x00000000);
@@ -244,51 +197,110 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
  * special cases..
  */
 void
-fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, struct pipe_surface *psurf)
+fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs,
+		struct pipe_surface **bufs)
 {
-	struct fd_resource *rsc = fd_resource(psurf->texture);
-	unsigned lvl = psurf->u.tex.level;
-	struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
-	uint32_t offset = fd_resource_offset(rsc, lvl, psurf->u.tex.first_layer);
-	enum pipe_format format = fd4_gmem_restore_format(psurf->format);
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS];
+	int i;
 
-	debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = (i < nr_bufs) ? 0xf : 0;
+	}
 
 	/* output sampler state: */
-	OUT_PKT3(ring, CP_LOAD_STATE, 4);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
 			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
 			CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) |
-			CP_LOAD_STATE_0_NUM_UNIT(1));
+			CP_LOAD_STATE_0_NUM_UNIT(nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) |
 			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-	OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) |
-			A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) |
-			A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) |
-			A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) |
-			A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT));
-	OUT_RING(ring, 0x00000000);
+	for (i = 0; i < nr_bufs; i++) {
+		OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) |
+				A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) |
+				A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) |
+				A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) |
+				A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT));
+		OUT_RING(ring, 0x00000000);
+	}
 
 	/* emit texture state: */
-	OUT_PKT3(ring, CP_LOAD_STATE, 10);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + (8 * nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
 			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
 			CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) |
-			CP_LOAD_STATE_0_NUM_UNIT(1));
+			CP_LOAD_STATE_0_NUM_UNIT(nr_bufs));
 	OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) |
 			CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-	OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) |
-			A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) |
-			fd4_tex_swiz(format,  PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN,
-					PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA));
-	OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(psurf->width) |
-			A4XX_TEX_CONST_1_HEIGHT(psurf->height));
-	OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp));
-	OUT_RING(ring, 0x00000000);
-	OUT_RELOC(ring, rsc->bo, offset, 0, 0);
-	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, 0x00000000);
-	OUT_RING(ring, 0x00000000);
+	for (i = 0; i < nr_bufs; i++) {
+		if (bufs[i]) {
+			struct fd_resource *rsc = fd_resource(bufs[i]->texture);
+			/* note: PIPE_BUFFER disallowed for surfaces */
+			unsigned lvl = bufs[i]->u.tex.level;
+			struct fd_resource_slice *slice = fd_resource_slice(rsc, lvl);
+			uint32_t offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer);
+			enum pipe_format format = fd4_gmem_restore_format(bufs[i]->format);
+
+			/* The restore blit_zs shader expects stencil in sampler 0,
+			 * and depth in sampler 1
+			 */
+			if (rsc->stencil && (i == 0)) {
+				rsc = rsc->stencil;
+				format = fd4_gmem_restore_format(rsc->base.b.format);
+			}
+
+			/* z32 restore is accomplished using depth write.  If there is
+			 * no stencil component (ie. PIPE_FORMAT_Z32_FLOAT)
+			 * then no render target:
+			 *
+			 * (The same applies for z32_s8x24, since for stencil sampler
+			 * state the above 'if' will replace 'format' with s8)
+			 */
+			if ((format == PIPE_FORMAT_Z32_FLOAT) ||
+					(format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT))
+				mrt_comp[i] = 0;
+
+			debug_assert(bufs[i]->u.tex.first_layer == bufs[i]->u.tex.last_layer);
+
+			OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) |
+					A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) |
+					fd4_tex_swiz(format,  PIPE_SWIZZLE_RED, PIPE_SWIZZLE_GREEN,
+							PIPE_SWIZZLE_BLUE, PIPE_SWIZZLE_ALPHA));
+			OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) |
+					A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height));
+			OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(slice->pitch * rsc->cpp));
+			OUT_RING(ring, 0x00000000);
+			OUT_RELOC(ring, rsc->bo, offset, 0, 0);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		} else {
+			OUT_RING(ring, A4XX_TEX_CONST_0_FMT(0) |
+					A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) |
+					A4XX_TEX_CONST_0_SWIZ_X(A4XX_TEX_ONE) |
+					A4XX_TEX_CONST_0_SWIZ_Y(A4XX_TEX_ONE) |
+					A4XX_TEX_CONST_0_SWIZ_Z(A4XX_TEX_ONE) |
+					A4XX_TEX_CONST_0_SWIZ_W(A4XX_TEX_ONE));
+			OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(0) |
+					A4XX_TEX_CONST_1_HEIGHT(0));
+			OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(0));
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		}
+	}
+
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
 }
 
 void
@@ -298,7 +310,9 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 	uint32_t total_in = 0;
 	const struct fd_vertex_state *vtx = emit->vtx;
 	struct ir3_shader_variant *vp = fd4_emit_get_vp(emit);
-	unsigned vertex_regid = regid(63, 0), instance_regid = regid(63, 0);
+	unsigned vertex_regid = regid(63, 0);
+	unsigned instance_regid = regid(63, 0);
+	unsigned vtxcnt_regid = regid(63, 0);
 
 	for (i = 0; i < vp->inputs_count; i++) {
 		uint8_t semantic = sem2name(vp->inputs[i].semantic);
@@ -306,6 +320,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 			vertex_regid = vp->inputs[i].regid;
 		else if (semantic == TGSI_SEMANTIC_INSTANCEID)
 			instance_regid = vp->inputs[i].regid;
+		else if (semantic == IR3_SEMANTIC_VTXCNT)
+			vtxcnt_regid = vp->inputs[i].regid;
 		else if ((i < vtx->vtx->num_elements) && vp->inputs[i].compmask)
 			last = i;
 	}
@@ -313,7 +329,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 	/* hw doesn't like to be configured for zero vbo's, it seems: */
 	if ((vtx->vtx->num_elements == 0) &&
 			(vertex_regid == regid(63, 0)) &&
-			(instance_regid == regid(63, 0)))
+			(instance_regid == regid(63, 0)) &&
+			(vtxcnt_regid == regid(63, 0)))
 		return;
 
 	for (i = 0, j = 0; i <= last; i++) {
@@ -327,7 +344,8 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 			enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt);
 			bool switchnext = (i != last) ||
 					(vertex_regid != regid(63, 0)) ||
-					(instance_regid != regid(63, 0));
+					(instance_regid != regid(63, 0)) ||
+					(vtxcnt_regid != regid(63, 0));
 			bool isint = util_format_is_pure_integer(pfmt);
 			uint32_t fs = util_format_get_blocksize(pfmt);
 			uint32_t off = vb->buffer_offset + elem->src_offset;
@@ -368,7 +386,7 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 			A4XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
 			A4XX_VFD_CONTROL_1_REGID4INST(instance_regid));
 	OUT_RING(ring, 0x00000000);   /* XXX VFD_CONTROL_2 */
-	OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(regid(63, 0)));
+	OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(vtxcnt_regid));
 	OUT_RING(ring, 0x00000000);   /* XXX VFD_CONTROL_4 */
 
 	/* cache invalidate, otherwise vertex fetch could see
@@ -389,6 +407,25 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	emit_marker(ring, 5);
 
+	if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->key.binning_pass) {
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
+
+		for (unsigned i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+			mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0;
+		}
+
+		OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+		OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+				A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+				A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+				A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+				A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+				A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+				A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+				A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+	}
+
 	if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !emit->key.binning_pass) {
 		uint32_t val = fd4_zsa_stateobj(ctx->zsa)->rb_render_control;
 
@@ -513,43 +550,24 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2]));
 	}
 
-	if (dirty & FD_DIRTY_PROG)
-		fd4_program_emit(ring, emit);
-
-	if ((dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) &&
-			/* evil hack to deal sanely with clear path: */
-			(emit->prog == &ctx->prog)) {
-		fd_wfi(ctx, ring);
-		emit_constants(ring,  SB_VERT_SHADER,
-				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);
-		if (!emit->key.binning_pass) {
-			emit_constants(ring, SB_FRAG_SHADER,
-					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
-		}
+	if (dirty & FD_DIRTY_PROG) {
+		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
+		fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs);
 	}
 
-	/* emit driver params every time */
-	if (emit->info && emit->prog == &ctx->prog) {
-		uint32_t vertex_params[4] = {
-			emit->info->indexed ? emit->info->index_bias : emit->info->start,
-			0,
-			0,
-			0
-		};
-		if (vp->constlen >= vp->first_driver_param + 4) {
-			fd4_emit_constant(ring, SB_VERT_SHADER,
-							  (vp->first_driver_param + 4) * 4,
-							  0, 4, vertex_params, NULL);
-		}
+	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
+		ir3_emit_consts(vp, ring, emit->info, dirty);
+		if (!emit->key.binning_pass)
+			ir3_emit_consts(fp, ring, emit->info, dirty);
+		/* mark clean after emitting consts: */
+		ctx->prog.dirty = 0;
 	}
 
 	if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
 		struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend);
 		uint32_t i;
 
-		for (i = 0; i < 8; i++) {
+		for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 			OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 			OUT_RING(ring, blend->rb_mrt[i].control);
 
@@ -607,10 +625,10 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_PKT0(ring, REG_A4XX_GRAS_DEBUG_ECO_CONTROL, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0EC3, 1);
+	OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1);
 	OUT_RING(ring, 0x00000006);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0F03, 1);
+	OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1);
 	OUT_RING(ring, 0x0000003a);
 
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0D01, 1);
@@ -629,7 +647,7 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, 0x00000012);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0E05, 1);
+	OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1);
 	OUT_RING(ring, 0x00000000);
 
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC5, 1);
@@ -752,9 +770,6 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1);
 	OUT_RING(ring, A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff));
 
-	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
-	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(0xf));
-
 	OUT_PKT0(ring, REG_A4XX_GRAS_CLEAR_CNTL, 1);
 	OUT_RING(ring, A4XX_GRAS_CLEAR_CNTL_NOT_FASTCLEAR);
 
@@ -763,3 +778,11 @@ fd4_emit_restore(struct fd_context *ctx)
 
 	ctx->needs_rb_fbd = true;
 }
+
+void
+fd4_emit_init(struct pipe_context *pctx)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->emit_const = fd4_emit_const;
+	ctx->emit_const_bo = fd4_emit_const_bo;
+}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index 7d059f8e532..ab7850e50b0 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -37,15 +37,13 @@
 #include "ir3_shader.h"
 
 struct fd_ringbuffer;
-enum adreno_state_block;
 
-void fd4_emit_constant(struct fd_ringbuffer *ring,
-		enum adreno_state_block sb,
+void fd4_emit_const(struct fd_ringbuffer *ring, enum shader_t type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc);
 
 void fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring,
-		struct pipe_surface *psurf);
+		unsigned nr_bufs, struct pipe_surface **bufs);
 
 /* grouped together emit-state for prog/vertex/state emit: */
 struct fd4_emit {
@@ -53,10 +51,12 @@ struct fd4_emit {
 	const struct fd_program_stateobj *prog;
 	const struct pipe_draw_info *info;
 	struct ir3_shader_key key;
-	enum a4xx_color_fmt format;
-	enum pipe_format pformat;
 	uint32_t dirty;
 
+	uint32_t sprite_coord_enable;  /* bitmask */
+	bool sprite_coord_mode;
+	bool rasterflat;
+
 	/* cached to avoid repeated lookups of same variants: */
 	struct ir3_shader_variant *vp, *fp;
 	/* TODO: other shader stages.. */
@@ -96,4 +96,6 @@ void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 void fd4_emit_restore(struct fd_context *ctx);
 
+void fd4_emit_init(struct pipe_context *pctx);
+
 #endif /* FD4_EMIT_H */
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
index 29abe0b0cc3..3e0045449eb 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
@@ -89,6 +89,14 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	_T(L8_UNORM,   8_UNORM, R8_UNORM, WZYX),
 	_T(I8_UNORM,   8_UNORM, NONE,     WZYX),
 
+	/* NOTE: should be TFMT_8_UINT (which then gets remapped to
+	 * TFMT_8_UNORM for mem2gmem in _gmem_restore_format()), but
+	 * we don't know TFMT_8_UINT yet.. so just use TFMT_8_UNORM
+	 * for now.. sampling from stencil as a texture might not
+	 * work right, but at least should be fine for zsbuf..
+	 */
+	_T(S8_UINT,    8_UNORM,  R8_UNORM, WZYX),
+
 	/* 16-bit */
 	V_(R16_UNORM,   16_UNORM, NONE,     WZYX),
 	V_(R16_SNORM,   16_SNORM, NONE,     WZYX),
@@ -96,7 +104,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R16_SINT,    16_SINT,  R16_SINT, WZYX),
 	V_(R16_USCALED, 16_UINT,  NONE,     WZYX),
 	V_(R16_SSCALED, 16_UINT,  NONE,     WZYX),
-	VT(R16_FLOAT,   16_FLOAT, NONE,     WZYX),
+	VT(R16_FLOAT,   16_FLOAT, R16_FLOAT,WZYX),
 
 	_T(A16_UINT,    16_UINT,  NONE,     WZYX),
 	_T(A16_SINT,    16_SINT,  NONE,     WZYX),
@@ -132,7 +140,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R32_SINT,    32_SINT,  R32_SINT, WZYX),
 	V_(R32_USCALED, 32_UINT,  NONE,     WZYX),
 	V_(R32_SSCALED, 32_UINT,  NONE,     WZYX),
-	VT(R32_FLOAT,   32_FLOAT, NONE,     WZYX),
+	VT(R32_FLOAT,   32_FLOAT, R32_FLOAT,WZYX),
 	V_(R32_FIXED,   32_FIXED, NONE,     WZYX),
 
 	_T(A32_UINT,    32_UINT,  NONE,     WZYX),
@@ -148,7 +156,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R16G16_SINT,    16_16_SINT,  R16G16_SINT, WZYX),
 	V_(R16G16_USCALED, 16_16_UINT,  NONE,        WZYX),
 	V_(R16G16_SSCALED, 16_16_SINT,  NONE,        WZYX),
-	VT(R16G16_FLOAT,   16_16_FLOAT, NONE,        WZYX),
+	VT(R16G16_FLOAT,   16_16_FLOAT, R16G16_FLOAT,WZYX),
 
 	_T(L16A16_UINT,    16_16_UINT,  NONE,        WZYX),
 	_T(L16A16_SINT,    16_16_SINT,  NONE,        WZYX),
@@ -191,7 +199,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 
 	_T(Z24X8_UNORM,       X8Z24_UNORM, R8G8B8A8_UNORM, WZYX),
 	_T(Z24_UNORM_S8_UINT, X8Z24_UNORM, R8G8B8A8_UNORM, WZYX),
-	/*_T(Z32_FLOAT,         Z32_FLOAT,   R8G8B8A8_UNORM, WZYX),*/
+	_T(Z32_FLOAT,         32_FLOAT,   R8G8B8A8_UNORM, WZYX),
+	_T(Z32_FLOAT_S8X24_UINT, 32_FLOAT,R8G8B8A8_UNORM, WZYX),
 
 	/* 48-bit */
 	V_(R16G16B16_UNORM,   16_16_16_UNORM, NONE, WZYX),
@@ -218,7 +227,7 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	VT(R32G32_SINT,    32_32_SINT,  R32G32_SINT, WZYX),
 	V_(R32G32_USCALED, 32_32_UINT,  NONE,        WZYX),
 	V_(R32G32_SSCALED, 32_32_SINT,  NONE,        WZYX),
-	VT(R32G32_FLOAT,   32_32_FLOAT, NONE,        WZYX),
+	VT(R32G32_FLOAT,   32_32_FLOAT, R32G32_FLOAT,WZYX),
 	V_(R32G32_FIXED,   32_32_FIXED, NONE,        WZYX),
 
 	_T(L32A32_UINT,    32_32_UINT,  NONE,        WZYX),
@@ -282,6 +291,9 @@ fd4_pipe2swap(enum pipe_format format)
 enum a4xx_tex_fetchsize
 fd4_pipe2fetchsize(enum pipe_format format)
 {
+	if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+		format = PIPE_FORMAT_Z32_FLOAT;
+
 	switch (util_format_get_blocksizebits(format)) {
 	case 8:   return TFETCH4_1_BYTE;
 	case 16:  return TFETCH4_2_BYTE;
@@ -312,6 +324,8 @@ fd4_gmem_restore_format(enum pipe_format format)
 		return PIPE_FORMAT_R8G8B8A8_UNORM;
 	case PIPE_FORMAT_Z16_UNORM:
 		return PIPE_FORMAT_R8G8_UNORM;
+	case PIPE_FORMAT_S8_UINT:
+		return PIPE_FORMAT_R8_UNORM;
 	default:
 		return format;
 	}
@@ -328,6 +342,9 @@ fd4_pipe2depth(enum pipe_format format)
 	case PIPE_FORMAT_X8Z24_UNORM:
 	case PIPE_FORMAT_S8_UINT_Z24_UNORM:
 		return DEPTH4_24_8;
+	case PIPE_FORMAT_Z32_FLOAT:
+	case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+		return DEPTH4_32;
 	default:
 		return ~0;
 	}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 9a905062071..81c37f72565 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -44,12 +44,6 @@
 #include "fd4_format.h"
 #include "fd4_zsa.h"
 
-static const struct ir3_shader_key key = {
-		// XXX should set this based on render target format!  We don't
-		// want half_precision if float32 render target!!!
-		.half_precision = true,
-};
-
 static void
 emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w)
@@ -63,7 +57,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		tile_mode = TILE4_LINEAR;
 	}
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 		enum a4xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
 		struct fd_resource *rsc = NULL;
@@ -74,11 +68,23 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 
 		if ((i < nr_bufs) && bufs[i]) {
 			struct pipe_surface *psurf = bufs[i];
+			enum pipe_format pformat = 0;
 
 			rsc = fd_resource(psurf->texture);
+			pformat = psurf->format;
+
+			/* When the target is Z32F_S8, stencil is kept in a separate resource,
+			 * so the "color" being written here actually goes to that stencil.
+			 */
+			if (rsc->stencil) {
+				rsc = rsc->stencil;
+				pformat = rsc->base.b.format;
+				bases++;
+			}
+
 			slice = fd_resource_slice(rsc, psurf->u.tex.level);
-			format = fd4_pipe2color(psurf->format);
-			swap = fd4_pipe2swap(psurf->format);
+			format = fd4_pipe2color(pformat);
+			swap = fd4_pipe2swap(pformat);
 
 			debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
 
@@ -94,6 +100,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			} else {
 				stride = slice->pitch * rsc->cpp;
 			}
+		} else if ((i < nr_bufs) && bases) {
+			base = bases[i];
 		}
 
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_BUF_INFO(i), 3);
@@ -101,7 +109,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 				A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap));
-		if (bin_w || (i >= nr_bufs)) {
+		if (bin_w || (i >= nr_bufs) || !bufs[i]) {
 			OUT_RING(ring, base);
 			OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride));
 		} else {
@@ -115,30 +123,26 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 	}
 }
 
-static uint32_t
-depth_base(struct fd_context *ctx)
-{
-	struct fd_gmem_stateobj *gmem = &ctx->gmem;
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
-	uint32_t cpp = 4;
-	if (pfb->cbufs[0]) {
-		struct fd_resource *rsc =
-				fd_resource(pfb->cbufs[0]->texture);
-		cpp = rsc->cpp;
-	}
-	return align(gmem->bin_w * gmem->bin_h * cpp, 0x4000);
-}
-
 /* transfer from gmem to system memory (ie. normal RAM) */
 
 static void
-emit_gmem2mem_surf(struct fd_context *ctx,
+emit_gmem2mem_surf(struct fd_context *ctx, bool stencil,
 		uint32_t base, struct pipe_surface *psurf)
 {
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct fd_resource *rsc = fd_resource(psurf->texture);
-	struct fd_resource_slice *slice = &rsc->slices[psurf->u.tex.level];
-	uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level,
+	enum pipe_format pformat = psurf->format;
+	struct fd_resource_slice *slice;
+	uint32_t offset;
+
+	if (stencil) {
+		debug_assert(rsc->stencil);
+		rsc = rsc->stencil;
+		pformat = rsc->base.b.format;
+	}
+
+	slice = &rsc->slices[psurf->u.tex.level];
+	offset = fd_resource_offset(rsc, psurf->u.tex.level,
 			psurf->u.tex.first_layer);
 
 	debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
@@ -150,10 +154,10 @@ emit_gmem2mem_surf(struct fd_context *ctx,
 	OUT_RELOCW(ring, rsc->bo, offset, 0, 0);   /* RB_COPY_DEST_BASE */
 	OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(slice->pitch * rsc->cpp));
 	OUT_RING(ring, A4XX_RB_COPY_DEST_INFO_TILE(TILE4_LINEAR) |
-			A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(psurf->format)) |
+			A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(pformat)) |
 			A4XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) |
 			A4XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) |
-			A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(psurf->format)));
+			A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(pformat)));
 
 	fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
 			DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL);
@@ -163,13 +167,15 @@ static void
 fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 {
 	struct fd4_context *fd4_ctx = fd4_context(ctx);
+	struct fd_gmem_stateobj *gmem = &ctx->gmem;
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 			.vtx = &fd4_ctx->solid_vbuf_state,
 			.prog = &ctx->solid_prog,
-			.key = key,
-			.format = fd4_emit_format(pfb->cbufs[0]),
+			.key = {
+				.half_precision = true,
+			},
 	};
 
 	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
@@ -238,16 +244,26 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
 	OUT_RING(ring, 0);            /* ??? UNKNOWN_2209 */
 
-	fd4_program_emit(ring, &emit);
+	fd4_program_emit(ring, &emit, 0, NULL);
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	if (ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
-		uint32_t base = depth_base(ctx);
-		emit_gmem2mem_surf(ctx, base, pfb->zsbuf);
+		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
+		if (!rsc->stencil || (ctx->resolve & FD_BUFFER_DEPTH))
+			emit_gmem2mem_surf(ctx, false, ctx->gmem.zsbuf_base[0], pfb->zsbuf);
+		if (rsc->stencil && (ctx->resolve & FD_BUFFER_STENCIL))
+			emit_gmem2mem_surf(ctx, true, ctx->gmem.zsbuf_base[1], pfb->zsbuf);
 	}
 
 	if (ctx->resolve & FD_BUFFER_COLOR) {
-		emit_gmem2mem_surf(ctx, 0, pfb->cbufs[0]);
+		unsigned i;
+		for (i = 0; i < pfb->nr_cbufs; i++) {
+			if (!pfb->cbufs[i])
+				continue;
+			if (!(ctx->resolve & (PIPE_CLEAR_COLOR0 << i)))
+				continue;
+			emit_gmem2mem_surf(ctx, false, gmem->cbuf_base[i], pfb->cbufs[i]);
+		}
 	}
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
@@ -260,14 +276,25 @@ fd4_emit_tile_gmem2mem(struct fd_context *ctx, struct fd_tile *tile)
 /* transfer from system memory to gmem */
 
 static void
-emit_mem2gmem_surf(struct fd_context *ctx, uint32_t base,
-		struct pipe_surface *psurf, uint32_t bin_w)
+emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases,
+		struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w)
 {
 	struct fd_ringbuffer *ring = ctx->ring;
+	struct pipe_surface *zsbufs[2];
 
-	emit_mrt(ring, 1, &psurf, &base, bin_w);
+	emit_mrt(ring, nr_bufs, bufs, bases, bin_w);
 
-	fd4_emit_gmem_restore_tex(ring, psurf);
+	if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) {
+		/* The gmem_restore_tex logic will put the first buffer's stencil
+		 * as color. Supply it with the proper information to make that
+		 * happen.
+		 */
+		zsbufs[0] = zsbufs[1] = bufs[0];
+		bufs = zsbufs;
+		nr_bufs = 2;
+	}
+
+	fd4_emit_gmem_restore_tex(ring, nr_bufs, bufs);
 
 	fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
 			DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL);
@@ -282,10 +309,14 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd4_emit emit = {
 			.vtx = &fd4_ctx->blit_vbuf_state,
+			.sprite_coord_enable = 1,
+			/* NOTE: all the blit programs use the same VP; this is for vtx bufs. */
 			.prog = &ctx->blit_prog[0],
-			.key = key,
-			.format = fd4_emit_format(pfb->cbufs[0]),
+			.key = {
+				.half_precision = fd_half_precision(pfb),
+			},
 	};
+	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	float x0, y0, x1, y1;
 	unsigned bin_w = tile->bin_w;
 	unsigned bin_h = tile->bin_h;
@@ -304,7 +335,9 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, fui(x1));
 	OUT_RING(ring, fui(y1));
 
-	for (i = 0; i < 8; i++) {
+	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
+		mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0;
+
 		OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1);
 		OUT_RING(ring, A4XX_RB_MRT_CONTROL_FASTCLEAR |
 				A4XX_RB_MRT_CONTROL_B11 |
@@ -319,6 +352,16 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 				A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO));
 	}
 
+	OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1);
+	OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+			A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+			A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+			A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+			A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+			A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+			A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+			A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+
 	OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1);
 	OUT_RING(ring, 0x8);          /* XXX RB_RENDER_CONTROL */
 
@@ -381,7 +424,6 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, 0);            /* VFD_INDEX_OFFSET */
 	OUT_RING(ring, 0);            /* ??? UNKNOWN_2209 */
 
-	fd4_program_emit(ring, &emit);
 	fd4_emit_vertex_bufs(ring, &emit);
 
 	/* for gmem pitch/base calculations, we need to use the non-
@@ -390,11 +432,46 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 	bin_w = gmem->bin_w;
 	bin_h = gmem->bin_h;
 
-	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL))
-		emit_mem2gmem_surf(ctx, depth_base(ctx), pfb->zsbuf, bin_w);
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR)) {
+		emit.prog = &ctx->blit_prog[pfb->nr_cbufs - 1];
+		emit.fp = NULL;      /* frag shader changed so clear cache */
+		fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs);
+		emit_mem2gmem_surf(ctx, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w);
+	}
 
-	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_COLOR))
-		emit_mem2gmem_surf(ctx, 0, pfb->cbufs[0], bin_w);
+	if (fd_gmem_needs_restore(ctx, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) {
+		switch (pfb->zsbuf->format) {
+		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+		case PIPE_FORMAT_Z32_FLOAT:
+			emit.prog = (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) ?
+					&ctx->blit_z : &ctx->blit_zs;
+			emit.key.half_precision = false;
+
+			OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
+			OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_Z_ENABLE |
+					A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE |
+					A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS) |
+					A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE);
+
+			OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
+			OUT_RING(ring, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE);
+
+			OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1);
+			OUT_RING(ring, 0x80000);   /* GRAS_CL_CLIP_CNTL */
+
+			break;
+		default:
+			/* Non-float can use a regular color write. It's split over 8-bit
+			 * components, so half precision is always sufficient.
+			 */
+			emit.prog = &ctx->blit_prog[0];
+			emit.key.half_precision = true;
+			break;
+		}
+		emit.fp = NULL;      /* frag shader changed so clear cache */
+		fd4_program_emit(ring, &emit, 1, &pfb->zsbuf);
+		emit_mem2gmem_surf(ctx, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w);
+	}
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1);
 	OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) |
@@ -534,21 +611,35 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile)
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd_gmem_stateobj *gmem = &ctx->gmem;
-	uint32_t reg;
 
-	OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
-	reg = A4XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base(ctx));
 	if (pfb->zsbuf) {
-		reg |= A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format));
-	}
-	OUT_RING(ring, reg);
-	if (pfb->zsbuf) {
-		uint32_t cpp = util_format_get_blocksize(pfb->zsbuf->format);
+		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
+		uint32_t cpp = rsc->cpp;
+
+		OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
+		OUT_RING(ring, A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]) |
+				A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format)));
 		OUT_RING(ring, A4XX_RB_DEPTH_PITCH(cpp * gmem->bin_w));
 		OUT_RING(ring, A4XX_RB_DEPTH_PITCH2(cpp * gmem->bin_w));
+
+		OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2);
+		if (rsc->stencil) {
+			OUT_RING(ring, A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL |
+					A4XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1]));
+			OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->cpp * gmem->bin_w));
+		} else {
+			OUT_RING(ring, 0x00000000);
+			OUT_RING(ring, 0x00000000);
+		}
 	} else {
+		OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3);
 		OUT_RING(ring, 0x00000000);
 		OUT_RING(ring, 0x00000000);
+		OUT_RING(ring, 0x00000000);
+
+		OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2);
+		OUT_RING(ring, 0);            /* RB_STENCIL_INFO */
+		OUT_RING(ring, 0);            /* RB_STENCIL_PITCH */
 	}
 
 	OUT_PKT0(ring, REG_A4XX_GRAS_DEPTH_CONTROL, 1);
@@ -586,7 +677,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
 	OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2));
 
-	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, gmem->bin_w);
+	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w);
 
 	/* setup scissor/offset for current tile: */
 	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index e8f5837f7ce..1a6d0142132 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -31,8 +31,6 @@
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_parse.h"
 
 #include "freedreno_program.h"
 
@@ -53,7 +51,7 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state
 		enum shader_t type)
 {
 	struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj);
-	so->shader = ir3_shader_create(pctx, cso->tokens, type);
+	so->shader = ir3_shader_create(pctx, cso, type);
 	return so;
 }
 
@@ -213,14 +211,17 @@ setup_stages(struct fd4_emit *emit, struct stage *s)
 }
 
 void
-fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
+fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
+		int nr, struct pipe_surface **bufs)
 {
 	struct stage s[MAX_STAGES];
-	uint32_t pos_regid, posz_regid, psize_regid, color_regid;
+	uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
 	uint32_t face_regid, coord_regid, zwcoord_regid;
 	int constmode;
 	int i, j, k;
 
+	debug_assert(nr <= ARRAY_SIZE(color_regid));
+
 	setup_stages(emit, s);
 
 	/* blob seems to always use constmode currently: */
@@ -232,11 +233,30 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
 	psize_regid = ir3_find_output_regid(s[VS].v,
 		ir3_semantic_name(TGSI_SEMANTIC_PSIZE, 0));
-	color_regid = ir3_find_output_regid(s[FS].v,
-		ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
+	if (s[FS].v->color0_mrt) {
+		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
+		color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
+			ir3_find_output_regid(s[FS].v, ir3_semantic_name(TGSI_SEMANTIC_COLOR, 0));
+	} else {
+		const struct ir3_shader_variant *fp = s[FS].v;
+		memset(color_regid, 0, sizeof(color_regid));
+		for (i = 0; i < fp->outputs_count; i++) {
+			ir3_semantic sem = fp->outputs[i].semantic;
+			unsigned idx = sem2idx(sem);
+			if (sem2name(sem) != TGSI_SEMANTIC_COLOR)
+				continue;
+			debug_assert(idx < ARRAY_SIZE(color_regid));
+			color_regid[idx] = fp->outputs[i].regid;
+		}
+	}
+
+	/* Adjust regids for alpha output formats. There is no alpha render
+	 * format, so alpha is treated like red: offset the output regid by 3.
+	 */
+	for (i = 0; i < nr; i++)
+		if (util_format_is_alpha(pipe_surface_format(bufs[i])))
+			color_regid[i] += 3;
 
-	if (util_format_is_alpha(emit->pformat))
-		color_regid += 3;
 
 	/* TODO get these dynamically: */
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
@@ -419,29 +439,24 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 					A4XX_RB_RENDER_CONTROL2_WCOORD));
 
 	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
-	OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(1) |
+	OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) |
 			COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z));
 
 	OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1);
-	if (s[FS].v->writes_pos) {
-		OUT_RING(ring, 0x00000001 |
-				A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE |
-				A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
-	} else {
-		OUT_RING(ring, 0x00000001);
-	}
+	OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) |
+			COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
+			A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
 
 	OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid) |
-			A4XX_SP_FS_MRT_REG_MRTFORMAT(emit->format) |
-			COND(emit->key.half_precision, A4XX_SP_FS_MRT_REG_HALF_PRECISION));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
-	OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(0));
+	for (i = 0; i < 8; i++) {
+		enum a4xx_color_fmt format = 0;
+		if (i < nr)
+			format = fd4_emit_format(bufs[i]);
+		OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
+				A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
+				COND(emit->key.half_precision,
+					A4XX_SP_FS_MRT_REG_HALF_PRECISION));
+	}
 
 	if (emit->key.binning_pass) {
 		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
@@ -450,10 +465,10 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 				COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
 		OUT_RING(ring, 0x00000000);
 	} else {
-		uint32_t vinterp[8], flatshade[2];
+		uint32_t vinterp[8], vpsrepl[8];
 
 		memset(vinterp, 0, sizeof(vinterp));
-		memset(flatshade, 0, sizeof(flatshade));
+		memset(vpsrepl, 0, sizeof(vpsrepl));
 
 		/* looks like we need to do int varyings in the frag
 		 * shader on a4xx (no flatshad reg?  or a420.0 bug?):
@@ -470,29 +485,40 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 		 * something like the code below instead of workaround
 		 * in the shader:
 		 */
-#if 0
-		/* figure out VARYING_INTERP / FLAT_SHAD register values: */
+		/* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
 		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {
 			uint32_t interp = s[FS].v->inputs[j].interpolate;
+
+			/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
+			 * instead.. rather than -8 everywhere else..
+			 */
+			uint32_t inloc = s[FS].v->inputs[j].inloc - 8;
+
+			/* currently assuming varyings aligned to 4 (not
+			 * packed):
+			 */
+			debug_assert((inloc % 4) == 0);
+
 			if ((interp == TGSI_INTERPOLATE_CONSTANT) ||
 					((interp == TGSI_INTERPOLATE_COLOR) && emit->rasterflat)) {
-				/* TODO might be cleaner to just +8 in SP_VS_VPC_DST_REG
-				 * instead.. rather than -8 everywhere else..
-				 */
-				uint32_t loc = s[FS].v->inputs[j].inloc - 8;
-
-				/* currently assuming varyings aligned to 4 (not
-				 * packed):
-				 */
-				debug_assert((loc % 4) == 0);
+				uint32_t loc = inloc;
 
 				for (i = 0; i < 4; i++, loc++) {
 					vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
-					flatshade[loc / 32] |= 1 << (loc % 32);
+					//flatshade[loc / 32] |= 1 << (loc % 32);
 				}
 			}
+
+			/* Replace the .xy coordinates with S/T from the point sprite. Set
+			 * interpolation bits for .zw such that they become .01
+			 */
+			if (emit->sprite_coord_enable & (1 << sem2idx(s[FS].v->inputs[j].semantic))) {
+				vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
+					<< ((inloc % 16) * 2);
+				vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2);
+				vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2);
+			}
 		}
-#endif
 
 		OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
 		OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) |
@@ -509,7 +535,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 
 		OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8);
 		for (i = 0; i < 8; i++)
-			OUT_RING(ring, s[FS].v->shader->vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
+			OUT_RING(ring, vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
 	}
 
 	if (s[VS].instrlen)
@@ -520,19 +546,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit)
 			emit_shader(ring, s[FS].v);
 }
 
-/* hack.. until we figure out how to deal w/ vpsrepl properly.. */
-static void
-fix_blit_fp(struct pipe_context *pctx)
-{
-	struct fd_context *ctx = fd_context(pctx);
-	struct fd4_shader_stateobj *so = ctx->blit_prog[0].fp;
-
-	so->shader->vpsrepl[0] = 0x99999999;
-	so->shader->vpsrepl[1] = 0x99999999;
-	so->shader->vpsrepl[2] = 0x99999999;
-	so->shader->vpsrepl[3] = 0x99999999;
-}
-
 void
 fd4_prog_init(struct pipe_context *pctx)
 {
@@ -543,6 +556,4 @@ fd4_prog_init(struct pipe_context *pctx)
 	pctx->delete_vs_state = fd4_vp_state_delete;
 
 	fd_prog_init(pctx);
-
-	fix_blit_fp(pctx);
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
index 52306a4c60d..8dfccaf9d74 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
@@ -39,7 +39,8 @@ struct fd4_shader_stateobj {
 
 struct fd4_emit;
 
-void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit);
+void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
+		int nr, struct pipe_surface **bufs);
 
 void fd4_prog_init(struct pipe_context *pctx);
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index 6db1c11b94b..4f69e0c1694 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -31,9 +31,93 @@
 #include "freedreno_util.h"
 
 #include "fd4_query.h"
+#include "fd4_draw.h"
 #include "fd4_format.h"
 
+
+struct fd_rb_samp_ctrs {
+	uint64_t ctr[16];
+};
+
+/*
+ * Occlusion Query:
+ *
+ * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
+ * interpret results
+ */
+
+static struct fd_hw_sample *
+occlusion_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{
+	struct fd_hw_sample *samp =
+			fd_hw_sample_init(ctx, sizeof(struct fd_rb_samp_ctrs));
+
+	/* low bits of sample addr should be zero (since they are control
+	 * flags in RB_SAMPLE_COUNT_CONTROL):
+	 */
+	debug_assert((samp->offset & 0x3) == 0);
+
+	/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
+	 * HW_QUERY_BASE_REG register:
+	 */
+	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+	OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
+	OUT_RING(ring, HW_QUERY_BASE_REG);
+	OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY |
+			samp->offset);
+
+	OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
+	OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
+						INDEX4_SIZE_32_BIT, USE_VISIBILITY));
+	OUT_RING(ring, 1);             /* NumInstances */
+	OUT_RING(ring, 0);             /* NumIndices */
+
+	fd_event_write(ctx, ring, ZPASS_DONE);
+
+	return samp;
+}
+
+static uint64_t
+count_samples(const struct fd_rb_samp_ctrs *start,
+		const struct fd_rb_samp_ctrs *end)
+{
+	return end->ctr[0] - start->ctr[0];
+}
+
+static void
+occlusion_counter_accumulate_result(struct fd_context *ctx,
+		const void *start, const void *end,
+		union pipe_query_result *result)
+{
+	uint64_t n = count_samples(start, end);
+	result->u64 += n;
+}
+
+static void
+occlusion_predicate_accumulate_result(struct fd_context *ctx,
+		const void *start, const void *end,
+		union pipe_query_result *result)
+{
+	uint64_t n = count_samples(start, end);
+	result->b |= (n > 0);
+}
+
+static const struct fd_hw_sample_provider occlusion_counter = {
+		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
+		.active = FD_STAGE_DRAW,
+		.get_sample = occlusion_get_sample,
+		.accumulate_result = occlusion_counter_accumulate_result,
+};
+
+static const struct fd_hw_sample_provider occlusion_predicate = {
+		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
+		.active = FD_STAGE_DRAW,
+		.get_sample = occlusion_get_sample,
+		.accumulate_result = occlusion_predicate_accumulate_result,
+};
+
 void fd4_query_context_init(struct pipe_context *pctx)
 {
-	/* TODO */
+	fd_hw_query_register_provider(pctx, &occlusion_counter);
+	fd_hw_query_register_provider(pctx, &occlusion_predicate);
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
index e54b606a285..dc7e98b149d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
@@ -50,7 +50,7 @@ fd4_rasterizer_state_create(struct pipe_context *pctx,
 
 	if (cso->point_size_per_vertex) {
 		psize_min = util_get_min_point_size(cso);
-		psize_max = 8192;
+		psize_max = 4092;
 	} else {
 		/* Force the point size to be as if the vertex output was disabled. */
 		psize_min = cso->point_size;
@@ -67,9 +67,9 @@ fd4_rasterizer_state_create(struct pipe_context *pctx,
 */
 	so->gras_cl_clip_cntl = 0x80000; /* ??? */
 	so->gras_su_point_minmax =
-			A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min/2) |
-			A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max/2);
-	so->gras_su_point_size   = A4XX_GRAS_SU_POINT_SIZE(cso->point_size/2);
+			A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) |
+			A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max);
+	so->gras_su_point_size   = A4XX_GRAS_SU_POINT_SIZE(cso->point_size);
 	so->gras_su_poly_offset_scale =
 			A4XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale);
 	so->gras_su_poly_offset_offset =
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
index 06c728f2f1f..64e81a9983b 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h
@@ -44,7 +44,7 @@ struct fd4_rasterizer_stateobj {
 	uint32_t pc_prim_vtx_cntl;
 };
 
-static INLINE struct fd4_rasterizer_stateobj *
+static inline struct fd4_rasterizer_stateobj *
 fd4_rasterizer_stateobj(struct pipe_rasterizer_state *rast)
 {
 	return (struct fd4_rasterizer_stateobj *)rast;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index e8cbb2d201a..d8ea414f300 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -102,7 +102,7 @@ void
 fd4_screen_init(struct pipe_screen *pscreen)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
-	screen->max_rts = 1;
+	screen->max_rts = A4XX_MAX_RENDER_TARGETS;
 	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd4_context_create;
 	pscreen->is_format_supported = fd4_screen_is_format_supported;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index 6ba25d0816d..d2bc5fee6c0 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -150,8 +150,8 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 {
 	struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view);
 	struct fd_resource *rsc = fd_resource(prsc);
-	unsigned lvl = cso->u.tex.first_level;
-	unsigned miplevels = cso->u.tex.last_level - lvl;
+	unsigned lvl = fd_sampler_first_level(cso);
+	unsigned miplevels = fd_sampler_last_level(cso) - lvl;
 
 	if (!so)
 		return NULL;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
index 579ed87f14b..84ee7ecb50c 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h
@@ -42,7 +42,7 @@ struct fd4_sampler_stateobj {
 	uint32_t texsamp0, texsamp1;
 };
 
-static INLINE struct fd4_sampler_stateobj *
+static inline struct fd4_sampler_stateobj *
 fd4_sampler_stateobj(struct pipe_sampler_state *samp)
 {
 	return (struct fd4_sampler_stateobj *)samp;
@@ -53,7 +53,7 @@ struct fd4_pipe_sampler_view {
 	uint32_t texconst0, texconst1, texconst2, texconst3, textconst4;
 };
 
-static INLINE struct fd4_pipe_sampler_view *
+static inline struct fd4_pipe_sampler_view *
 fd4_pipe_sampler_view(struct pipe_sampler_view *pview)
 {
 	return (struct fd4_pipe_sampler_view *)pview;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h
index 033317cf620..6a92a9b6785 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h
@@ -47,7 +47,7 @@ struct fd4_zsa_stateobj {
 	uint32_t rb_stencilrefmask_bf;
 };
 
-static INLINE struct fd4_zsa_stateobj *
+static inline struct fd4_zsa_stateobj *
 fd4_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa)
 {
 	return (struct fd4_zsa_stateobj *)zsa;
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index b23aa830770..00b6acba065 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -8,15 +8,15 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
-Copyright (C) 2013-2014 by the following authors:
+Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 
 Permission is hereby granted, free of charge, to any person obtaining
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 2b24c5b4e78..98a90e26679 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -8,13 +8,13 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2013-11-30 14:47:15)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2013-03-31 16:51:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2014-06-02 15:21:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2014-11-13 22:44:30)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14895 bytes, from 2015-04-19 15:23:28)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-04-12 18:16:35)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  59314 bytes, from 2015-04-19 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -67,7 +67,7 @@ enum vgt_event_type {
 
 enum pc_di_primtype {
 	DI_PT_NONE = 0,
-	DI_PT_POINTLIST_A2XX = 1,
+	DI_PT_POINTLIST_PSIZE = 1,
 	DI_PT_LINELIST = 2,
 	DI_PT_LINESTRIP = 3,
 	DI_PT_TRILIST = 4,
@@ -75,7 +75,7 @@ enum pc_di_primtype {
 	DI_PT_TRISTRIP = 6,
 	DI_PT_LINELOOP = 7,
 	DI_PT_RECTLIST = 8,
-	DI_PT_POINTLIST_A3XX = 9,
+	DI_PT_POINTLIST = 9,
 	DI_PT_LINE_ADJ = 10,
 	DI_PT_LINESTRIP_ADJ = 11,
 	DI_PT_TRI_ADJ = 12,
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 668ef3629bf..8e6d43150ce 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -94,9 +94,7 @@ void
 fd_context_render(struct pipe_context *pctx)
 {
 	struct fd_context *ctx = fd_context(pctx);
-	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct fd_resource *rsc, *rsc_tmp;
-	int i;
 
 	DBG("needs_flush: %d", ctx->needs_flush);
 
@@ -118,20 +116,11 @@ fd_context_render(struct pipe_context *pctx)
 	ctx->gmem_reason = 0;
 	ctx->num_draws = 0;
 
-	for (i = 0; i < pfb->nr_cbufs; i++)
-		if (pfb->cbufs[i])
-			fd_resource(pfb->cbufs[i]->texture)->dirty = false;
-	if (pfb->zsbuf) {
-		rsc = fd_resource(pfb->zsbuf->texture);
-		rsc->dirty = false;
-		if (rsc->stencil)
-			rsc->stencil->dirty = false;
-	}
-
 	/* go through all the used resources and clear their reading flag */
 	LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &ctx->used_resources, list) {
-		assert(rsc->reading);
-		rsc->reading = false;
+		debug_assert(rsc->status != 0);
+		rsc->status = 0;
+		rsc->pending_ctx = NULL;
 		list_delinit(&rsc->list);
 	}
 
@@ -144,8 +133,10 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
 {
 	fd_context_render(pctx);
 
-	if (fence)
+	if (fence) {
+		fd_screen_fence_ref(pctx->screen, fence, NULL);
 		*fence = fd_fence_create(pctx);
+	}
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index e420f1e5bd9..509a90fdf23 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -82,6 +82,20 @@ struct fd_vertex_stateobj {
 	unsigned num_elements;
 };
 
+struct fd_streamout_stateobj {
+	struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];  /* bound targets; slots may be NULL */
+	unsigned num_targets;  /* number of slots of targets[] currently in use */
+	/* Track offset from vtxcnt for streamout data.  This counter
+	 * is just incremented by # of vertices on each draw until
+	 * reset or a new streamout buffer is bound.
+	 *
+	 * When we eventually have GS, the CPU won't actually know the
+	 * number of vertices per draw, so I think we'll have to do
+	 * something more clever.
+	 */
+	unsigned offsets[PIPE_MAX_SO_BUFFERS];
+};
+
 /* group together the vertex and vertexbuf state.. for ease of passing
  * around, and because various internal operations (gmem<->mem, etc)
  * need their own vertex state:
@@ -179,7 +193,7 @@ struct fd_context {
 	struct fd_program_stateobj solid_prog; // TODO move to screen?
 
 	/* shaders used by mem->gmem blits: */
-	struct fd_program_stateobj blit_prog[8]; // TODO move to screen?
+	struct fd_program_stateobj blit_prog[MAX_RENDER_TARGETS]; // TODO move to screen?
 	struct fd_program_stateobj blit_z, blit_zs;
 
 	/* do we need to mem2gmem before rendering.  We don't, if for example,
@@ -319,6 +333,7 @@ struct fd_context {
 		FD_DIRTY_VTXBUF      = (1 << 15),
 		FD_DIRTY_INDEXBUF    = (1 << 16),
 		FD_DIRTY_SCISSOR     = (1 << 17),
+		FD_DIRTY_STREAMOUT   = (1 << 18),
 	} dirty;
 
 	struct pipe_blend_state *blend;
@@ -339,6 +354,7 @@ struct fd_context {
 	struct pipe_viewport_state viewport;
 	struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
 	struct pipe_index_buffer indexbuf;
+	struct fd_streamout_stateobj streamout;
 
 	/* GMEM/tile handling fxns: */
 	void (*emit_tile_init)(struct fd_context *ctx);
@@ -351,18 +367,25 @@ struct fd_context {
 	void (*emit_sysmem_prep)(struct fd_context *ctx);
 
 	/* draw: */
-	void (*draw_vbo)(struct fd_context *pctx, const struct pipe_draw_info *info);
+	void (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info);
 	void (*clear)(struct fd_context *ctx, unsigned buffers,
 			const union pipe_color_union *color, double depth, unsigned stencil);
+
+	/* constant emit:  (note currently not used/needed for a2xx) */
+	void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type,
+			uint32_t regid, uint32_t offset, uint32_t sizedwords,
+			const uint32_t *dwords, struct pipe_resource *prsc);
+	void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
+			uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets);
 };
 
-static INLINE struct fd_context *
+static inline struct fd_context *
 fd_context(struct pipe_context *pctx)
 {
 	return (struct fd_context *)pctx;
 }
 
-static INLINE struct pipe_scissor_state *
+static inline struct pipe_scissor_state *
 fd_context_get_scissor(struct fd_context *ctx)
 {
 	if (ctx->rasterizer && ctx->rasterizer->scissor)
@@ -370,13 +393,13 @@ fd_context_get_scissor(struct fd_context *ctx)
 	return &ctx->disabled_scissor;
 }
 
-static INLINE bool
+static inline bool
 fd_supported_prim(struct fd_context *ctx, unsigned prim)
 {
 	return (1 << prim) & ctx->primtype_mask;
 }
 
-static INLINE void
+static inline void
 fd_reset_wfi(struct fd_context *ctx)
 {
 	ctx->needs_wfi = true;
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index c9e317c7dc9..6831a58749c 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -40,7 +40,8 @@
 #include "freedreno_util.h"
 
 static void
-resource_reading(struct fd_context *ctx, struct pipe_resource *prsc)
+resource_used(struct fd_context *ctx, struct pipe_resource *prsc,
+		enum fd_resource_status status)
 {
 	struct fd_resource *rsc;
 
@@ -48,9 +49,29 @@ resource_reading(struct fd_context *ctx, struct pipe_resource *prsc)
 		return;
 
 	rsc = fd_resource(prsc);
-	rsc->reading = true;
+	rsc->status |= status;
+	if (rsc->stencil)
+		rsc->stencil->status |= status;
+
+	/* TODO resources can actually be shared across contexts,
+	 * so I'm not sure a single list-head will do the trick?
+	 */
+	debug_assert((rsc->pending_ctx == ctx) || !rsc->pending_ctx);
 	list_delinit(&rsc->list);
 	list_addtail(&rsc->list, &ctx->used_resources);
+	rsc->pending_ctx = ctx;
+}
+
+static void
+resource_read(struct fd_context *ctx, struct pipe_resource *prsc)
+{
+	resource_used(ctx, prsc, FD_PENDING_READ);  /* track prsc with a pending-read flag */
+}
+
+static void
+resource_written(struct fd_context *ctx, struct pipe_resource *prsc)
+{
+	resource_used(ctx, prsc, FD_PENDING_WRITE);  /* track prsc with a pending-write flag */
 }
 
 static void
@@ -59,7 +80,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	struct fd_context *ctx = fd_context(pctx);
 	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 	struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
-	unsigned i, buffers = 0;
+	unsigned i, prims, buffers = 0;
 
 	/* if we supported transform feedback, we'd have to disable this: */
 	if (((scissor->maxx - scissor->minx) *
@@ -69,6 +90,8 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 	/* emulate unsupported primitives: */
 	if (!fd_supported_prim(ctx, info->mode)) {
+		if (ctx->streamout.num_targets > 0)
+			debug_error("stream-out with emulated prims");
 		util_primconvert_save_index_buffer(ctx->primconvert, &ctx->indexbuf);
 		util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer);
 		util_primconvert_draw_vbo(ctx->primconvert, info);
@@ -83,17 +106,13 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 	if (fd_depth_enabled(ctx)) {
 		buffers |= FD_BUFFER_DEPTH;
-		fd_resource(pfb->zsbuf->texture)->dirty = true;
+		resource_written(ctx, pfb->zsbuf->texture);
 		ctx->gmem_reason |= FD_GMEM_DEPTH_ENABLED;
 	}
 
 	if (fd_stencil_enabled(ctx)) {
-		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
 		buffers |= FD_BUFFER_STENCIL;
-		if (rsc->stencil)
-			rsc->stencil->dirty = true;
-		else
-			rsc->dirty = true;
+		resource_written(ctx, pfb->zsbuf->texture);
 		ctx->gmem_reason |= FD_GMEM_STENCIL_ENABLED;
 	}
 
@@ -108,7 +127,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 		surf = pfb->cbufs[i]->texture;
 
-		fd_resource(surf)->dirty = true;
+		resource_written(ctx, surf);
 		buffers |= PIPE_CLEAR_COLOR0 << i;
 
 		if (surf->nr_samples > 1)
@@ -120,32 +139,38 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 	/* Skip over buffer 0, that is sent along with the command stream */
 	for (i = 1; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
-		resource_reading(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer);
-		resource_reading(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer);
+		resource_read(ctx, ctx->constbuf[PIPE_SHADER_VERTEX].cb[i].buffer);
+		resource_read(ctx, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb[i].buffer);
 	}
 
 	/* Mark VBOs as being read */
 	for (i = 0; i < ctx->vtx.vertexbuf.count; i++) {
 		assert(!ctx->vtx.vertexbuf.vb[i].user_buffer);
-		resource_reading(ctx, ctx->vtx.vertexbuf.vb[i].buffer);
+		resource_read(ctx, ctx->vtx.vertexbuf.vb[i].buffer);
 	}
 
 	/* Mark index buffer as being read */
-	resource_reading(ctx, ctx->indexbuf.buffer);
+	resource_read(ctx, ctx->indexbuf.buffer);
 
 	/* Mark textures as being read */
 	for (i = 0; i < ctx->verttex.num_textures; i++)
 		if (ctx->verttex.textures[i])
-			resource_reading(ctx, ctx->verttex.textures[i]->texture);
+			resource_read(ctx, ctx->verttex.textures[i]->texture);
 	for (i = 0; i < ctx->fragtex.num_textures; i++)
 		if (ctx->fragtex.textures[i])
-			resource_reading(ctx, ctx->fragtex.textures[i]->texture);
+			resource_read(ctx, ctx->fragtex.textures[i]->texture);
+
+	/* Mark streamout buffers as being written.. */
+	for (i = 0; i < ctx->streamout.num_targets; i++)
+		if (ctx->streamout.targets[i])
+			resource_written(ctx, ctx->streamout.targets[i]->buffer);
 
 	ctx->num_draws++;
 
+	prims = u_reduced_prims_for_vertices(info->mode, info->count);
+
 	ctx->stats.draw_calls++;
-	ctx->stats.prims_emitted +=
-		u_reduced_prims_for_vertices(info->mode, info->count);
+	ctx->stats.prims_emitted += prims;
 
 	/* any buffers that haven't been cleared yet, we need to restore: */
 	ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared);
@@ -159,6 +184,9 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_DRAW);
 	ctx->draw_vbo(ctx, info);
 
+	for (i = 0; i < ctx->streamout.num_targets; i++)
+		ctx->streamout.offsets[i] += info->count;  /* offsets[] is in vertices, not reduced prims */
+
 	/* if an app (or, well, piglit test) does many thousands of draws
 	 * without flush (or anything which implicitly flushes, like
 	 * changing render targets), we can exceed the ringbuffer size.
@@ -216,15 +244,10 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR)
 		for (i = 0; i < pfb->nr_cbufs; i++)
 			if (buffers & (PIPE_CLEAR_COLOR0 << i))
-				fd_resource(pfb->cbufs[i]->texture)->dirty = true;
+				resource_written(ctx, pfb->cbufs[i]->texture);
 
 	if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
-		if (rsc->stencil && buffers & PIPE_CLEAR_STENCIL)
-			rsc->stencil->dirty = true;
-		if (!rsc->stencil || buffers & PIPE_CLEAR_DEPTH)
-			rsc->dirty = true;
-
+		resource_written(ctx, pfb->zsbuf->texture);
 		ctx->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL;
 	}
 
@@ -242,7 +265,8 @@ fd_clear(struct pipe_context *pctx, unsigned buffers,
 			FD_DIRTY_SAMPLE_MASK |
 			FD_DIRTY_PROG |
 			FD_DIRTY_CONSTBUF |
-			FD_DIRTY_BLEND;
+			FD_DIRTY_BLEND |
+			FD_DIRTY_FRAMEBUFFER;
 
 	if (fd_mesa_debug & FD_DBG_DCLEAR)
 		ctx->dirty = 0xffffffff;
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 375e58f7022..04a9feacd58 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -69,6 +69,9 @@ boolean fd_screen_fence_finish(struct pipe_screen *screen,
 		struct pipe_fence_handle *fence,
 		uint64_t timeout)
 {
+	if (!timeout)
+		return fd_screen_fence_signalled(screen, fence);
+
 	if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
 		return false;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index c105378ec4e..648db9baee5 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -82,7 +82,7 @@ total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp[2],
 {
 	uint32_t total = 0, i;
 
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < MAX_RENDER_TARGETS; i++) {
 		if (cbuf_cpp[i]) {
 			gmem->cbuf_base[i] = align(total, 0x4000);
 			total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h;
@@ -113,7 +113,7 @@ calculate_tiles(struct fd_context *ctx)
 	uint32_t nbins_x = 1, nbins_y = 1;
 	uint32_t bin_w, bin_h;
 	uint32_t max_width = bin_width(ctx);
-	uint8_t cbuf_cpp[4] = {0}, zsbuf_cpp[2] = {0};
+	uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0};
 	uint32_t i, j, t, xoff, yoff;
 	uint32_t tpp_x, tpp_y;
 	bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
@@ -162,12 +162,17 @@ calculate_tiles(struct fd_context *ctx)
 		bin_w = align(width / nbins_x, 32);
 	}
 
+	if (fd_mesa_debug & FD_DBG_MSGS) {
+		debug_printf("binning input: cbuf cpp:");
+		for (i = 0; i < pfb->nr_cbufs; i++)
+			debug_printf(" %d", cbuf_cpp[i]);
+		debug_printf(", zsbuf cpp: %d; %dx%d\n",
+				zsbuf_cpp[0], width, height);
+	}
+
 	/* then find a bin width/height that satisfies the memory
 	 * constraints:
 	 */
-	DBG("binning input: cbuf cpp: %d %d %d %d, zsbuf cpp: %d; %dx%d",
-		cbuf_cpp[0], cbuf_cpp[1], cbuf_cpp[2], cbuf_cpp[3], zsbuf_cpp[0],
-		width, height);
 	while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem) > gmem_size) {
 		if (bin_w > bin_h) {
 			nbins_x++;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h
index 5867235db90..38b557eb077 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -31,6 +31,8 @@
 
 #include "pipe/p_context.h"
 
+#include "freedreno_util.h"
+
 /* per-pipe configuration for hw binning: */
 struct fd_vsc_pipe {
 	struct fd_bo *bo;
@@ -47,9 +49,9 @@ struct fd_tile {
 
 struct fd_gmem_stateobj {
 	struct pipe_scissor_state scissor;
-	uint32_t cbuf_base[4];
+	uint32_t cbuf_base[MAX_RENDER_TARGETS];
 	uint32_t zsbuf_base[2];
-	uint8_t cbuf_cpp[4];
+	uint8_t cbuf_cpp[MAX_RENDER_TARGETS];
 	uint8_t zsbuf_cpp[2];
 	uint16_t bin_h, nbins_y;
 	uint16_t bin_w, nbins_x;
diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c
index 5e344e69146..e6a647852a3 100644
--- a/src/gallium/drivers/freedreno/freedreno_program.c
+++ b/src/gallium/drivers/freedreno/freedreno_program.c
@@ -96,7 +96,11 @@ fd_prog_blit(struct pipe_context *pctx, int rts, bool depth)
 {
 	int i;
 	struct ureg_src tc;
-	struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
+	struct ureg_program *ureg;
+
+	debug_assert(rts <= MAX_RENDER_TARGETS);
+
+	ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
 	if (!ureg)
 		return NULL;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 95f79df565e..709ad4eb55b 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -42,6 +42,14 @@
 
 #include <errno.h>
 
+
+static bool
+pending(struct fd_resource *rsc, enum fd_resource_status status)
+{
+	unsigned queued = rsc->status | (rsc->stencil ? rsc->stencil->status : 0);
+	return (queued & status) != 0;  /* rsc or its stencil resource has such access flagged */
+}
+
 static void
 fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc)
 {
@@ -72,11 +80,11 @@ fd_invalidate_resource(struct fd_context *ctx, struct pipe_resource *prsc)
 
 	/* Textures */
 	for (i = 0; i < ctx->verttex.num_textures && !(ctx->dirty & FD_DIRTY_VERTTEX); i++) {
-		if (ctx->verttex.textures[i]->texture == prsc)
+		if (ctx->verttex.textures[i] && (ctx->verttex.textures[i]->texture == prsc))
 			ctx->dirty |= FD_DIRTY_VERTTEX;
 	}
 	for (i = 0; i < ctx->fragtex.num_textures && !(ctx->dirty & FD_DIRTY_FRAGTEX); i++) {
-		if (ctx->fragtex.textures[i]->texture == prsc)
+		if (ctx->fragtex.textures[i] && (ctx->fragtex.textures[i]->texture == prsc))
 			ctx->dirty |= FD_DIRTY_FRAGTEX;
 	}
 }
@@ -97,7 +105,8 @@ realloc_bo(struct fd_resource *rsc, uint32_t size)
 
 	rsc->bo = fd_bo_new(screen->dev, size, flags);
 	rsc->timestamp = 0;
-	rsc->dirty = rsc->reading = false;
+	rsc->status = 0;
+	rsc->pending_ctx = NULL;
 	list_delinit(&rsc->list);
 	util_range_set_empty(&rsc->valid_buffer_range);
 }
@@ -238,8 +247,9 @@ fd_resource_transfer_map(struct pipe_context *pctx,
 		/* If the GPU is writing to the resource, or if it is reading from the
 		 * resource and we're trying to write to it, flush the renders.
 		 */
-		if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty) ||
-			((ptrans->usage & PIPE_TRANSFER_WRITE) && rsc->reading))
+		if (((ptrans->usage & PIPE_TRANSFER_WRITE) &&
+					pending(rsc, FD_PENDING_READ | FD_PENDING_WRITE)) ||
+				pending(rsc, FD_PENDING_WRITE))
 			fd_context_render(pctx);
 
 		/* The GPU keeps track of how the various bo's are being used, and
@@ -646,6 +656,8 @@ fd_blitter_pipe_begin(struct fd_context *ctx)
 	util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb);
 	util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx);
 	util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vp);
+	util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets,
+			ctx->streamout.targets);
 	util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer);
 	util_blitter_save_viewport(ctx->blitter, &ctx->viewport);
 	util_blitter_save_scissor(ctx->blitter, &ctx->scissor);
@@ -675,7 +687,7 @@ fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
 {
 	struct fd_resource *rsc = fd_resource(prsc);
 
-	if (rsc->dirty || (rsc->stencil && rsc->stencil->dirty))
+	if (pending(rsc, FD_PENDING_WRITE | FD_PENDING_READ))
 		fd_context_render(pctx);
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h
index 0634923fcb2..7549becaa1f 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -60,6 +60,15 @@ struct fd_resource_slice {
 	uint32_t size0;          /* size of first layer in slice */
 };
 
+/* status of queued-up but not yet flushed read and write operations.
+ * In _transfer_map() we need to know if queued up rendering needs
+ * to be flushed to preserve the order of cpu and gpu access.
+ */
+enum fd_resource_status {
+	FD_PENDING_WRITE = 0x01,  /* bit flags: the two values may be OR'd together */
+	FD_PENDING_READ  = 0x02,
+};
+
 struct fd_resource {
 	struct u_resource base;
 	struct fd_bo *bo;
@@ -68,17 +77,23 @@ struct fd_resource {
 	uint32_t layer_size;
 	struct fd_resource_slice slices[MAX_MIP_LEVELS];
 	uint32_t timestamp;
-	bool dirty, reading;
 	/* buffer range that has been initialized */
 	struct util_range valid_buffer_range;
 
 	/* reference to the resource holding stencil data for a z32_s8 texture */
+	/* TODO rename to secondary or auxiliary? */
 	struct fd_resource *stencil;
 
+	/* pending read/write state: */
+	enum fd_resource_status status;
+	/* resources accessed by queued but not flushed draws are tracked
+	 * in the used_resources list.
+	 */
 	struct list_head list;
+	struct fd_context *pending_ctx;
 };
 
-static INLINE struct fd_resource *
+static inline struct fd_resource *
 fd_resource(struct pipe_resource *ptex)
 {
 	return (struct fd_resource *)ptex;
@@ -89,13 +104,13 @@ struct fd_transfer {
 	void *staging;
 };
 
-static INLINE struct fd_transfer *
+static inline struct fd_transfer *
 fd_transfer(struct pipe_transfer *ptrans)
 {
 	return (struct fd_transfer *)ptrans;
 }
 
-static INLINE struct fd_resource_slice *
+static inline struct fd_resource_slice *
 fd_resource_slice(struct fd_resource *rsc, unsigned level)
 {
 	assert(level <= rsc->base.b.last_level);
@@ -103,7 +118,7 @@ fd_resource_slice(struct fd_resource *rsc, unsigned level)
 }
 
 /* get offset for specified mipmap level and texture/array layer */
-static INLINE uint32_t
+static inline uint32_t
 fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer)
 {
 	struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b3b5462b437..b55f5b36ca9 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -68,7 +68,8 @@ static const struct debug_named_value debug_options[] = {
 		{"fraghalf",  FD_DBG_FRAGHALF, "Use half-precision in fragment shader"},
 		{"nobin",     FD_DBG_NOBIN,  "Disable hw binning"},
 		{"optmsgs",   FD_DBG_OPTMSGS,"Enable optimizer debug messages"},
-		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 120 (rather than 130) on a3xx+"},
+		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"},
+		{"shaderdb",  FD_DBG_SHADERDB, "Enable shaderdb output"},
 		DEBUG_NAMED_VALUE_END
 };
 
@@ -163,9 +164,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
 	case PIPE_CAP_CUBE_MAP_ARRAY:
-	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
-	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
-	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
 	case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
 	case PIPE_CAP_START_INSTANCE:
 	case PIPE_CAP_COMPUTE:
@@ -175,10 +173,23 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_PRIMITIVE_RESTART:
 	case PIPE_CAP_TGSI_INSTANCEID:
 	case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
-		return is_a3xx(screen) || is_a4xx(screen);
-
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
 	case PIPE_CAP_INDEP_BLEND_FUNC:
+	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+		return is_a3xx(screen) || is_a4xx(screen);
+
+	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+		/* ignoring first/last_element.. but I guess that should be
+		 * easy to add..
+		 */
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+		/* I think 32k on a4xx.. and we could possibly emulate more
+		 * by pretending 2d/rect textures and splitting high bits
+		 * of index into 2nd dimension..
+		 */
+		return 16383;
+
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
 		return is_a3xx(screen);
 
@@ -188,7 +199,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
 		if (glsl120)
 			return 120;
-		return (is_a3xx(screen) || is_a4xx(screen)) ? 130 : 120;
+		return is_ir3(screen) ? 130 : 120;
 
 	/* Unsupported features. */
 	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
@@ -218,6 +229,10 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -225,9 +240,17 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 
 	/* Stream output. */
 	case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+		if (is_ir3(screen))
+			return PIPE_MAX_SO_BUFFERS;
+		return 0;
 	case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+		if (is_ir3(screen))
+			return 1;
+		return 0;
 	case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
 	case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+		if (is_ir3(screen))
+			return 16 * 4;   /* should only be shader out limit? */
 		return 0;
 
 	/* Geometry shader output, unsupported. */
@@ -258,9 +281,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_QUERY_TIMESTAMP:
 		return 0;
 	case PIPE_CAP_OCCLUSION_QUERY:
-		/* TODO still missing on a4xx, but we lie to get gl2..
-		 * it's not a feature, it's a bug!
-		 */
 		return is_a3xx(screen) || is_a4xx(screen);
 
 	case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
@@ -357,7 +377,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		 */
 		return ((is_a3xx(screen) || is_a4xx(screen)) ? 4096 : 64) * sizeof(float[4]);
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-		return (is_a3xx(screen) || is_a4xx(screen)) ? 16 : 1;
+		return is_ir3(screen) ? 16 : 1;
 	case PIPE_SHADER_CAP_MAX_PREDS:
 		return 0; /* nothing uses this */
 	case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
@@ -379,7 +399,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 	case PIPE_SHADER_CAP_INTEGERS:
 		if (glsl120)
 			return 0;
-		return (is_a3xx(screen) || is_a4xx(screen)) ? 1 : 0;
+		return is_ir3(screen) ? 1 : 0;
 	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
 		return 16;
@@ -546,7 +566,6 @@ fd_screen_create(struct fd_device *dev)
 	pscreen->get_timestamp = fd_screen_get_timestamp;
 
 	pscreen->fence_reference = fd_screen_fence_ref;
-	pscreen->fence_signalled = fd_screen_fence_signalled;
 	pscreen->fence_finish = fd_screen_fence_finish;
 
 	util_format_s3tc_init();
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index dbc2808262a..4e5c3a61958 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -56,7 +56,7 @@ struct fd_screen {
 	int64_t cpu_gpu_time_delta;
 };
 
-static INLINE struct fd_screen *
+static inline struct fd_screen *
 fd_screen(struct pipe_screen *pscreen)
 {
 	return (struct fd_screen *)pscreen;
@@ -73,6 +73,7 @@ struct fd_bo * fd_screen_bo_from_handle(struct pipe_screen *pscreen,
 struct pipe_screen * fd_screen_create(struct fd_device *dev);
 
 /* is a3xx patch revision 0? */
+/* TODO a306.0 probably doesn't need this.. be more clever?? */
 static inline boolean
 is_a3xx_p0(struct fd_screen *screen)
 {
@@ -91,4 +92,11 @@ is_a4xx(struct fd_screen *screen)
 	return (screen->gpu_id >= 400) && (screen->gpu_id < 500);
 }
 
+/* is it using the ir3 compiler (shader isa introduced with a3xx)?  true for a3xx and a4xx */
+static inline boolean
+is_ir3(struct fd_screen *screen)
+{
+	return is_a3xx(screen) || is_a4xx(screen);
+}
+
 #endif /* FREEDRENO_SCREEN_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 77aa4f21d3b..7bf8bdb4507 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -300,6 +300,67 @@ fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
 	ctx->dirty |= FD_DIRTY_VTXSTATE;
 }
 
+/* Create a stream-out target for prsc with the given offset/size.  The new
+ * target starts with a reference count of one; returns NULL if allocation fails.
+ */
+static struct pipe_stream_output_target *
+fd_create_stream_output_target(struct pipe_context *pctx, struct pipe_resource *prsc,
+		unsigned buffer_offset, unsigned buffer_size)
+{
+	struct pipe_stream_output_target *so_target =
+			CALLOC_STRUCT(pipe_stream_output_target);
+
+	if (so_target == NULL)
+		return NULL;
+
+	so_target->context = pctx;
+	so_target->buffer_offset = buffer_offset;
+	so_target->buffer_size = buffer_size;
+	pipe_reference_init(&so_target->reference, 1);
+	pipe_resource_reference(&so_target->buffer, prsc);
+	return so_target;
+}
+
+static void
+fd_stream_output_target_destroy(struct pipe_context *pctx,
+		struct pipe_stream_output_target *target)
+{
+	pipe_resource_reference(&target->buffer, NULL);  /* clear the target's buffer reference */
+	FREE(target);  /* pctx is not used here */
+}
+
+static void
+fd_set_stream_output_targets(struct pipe_context *pctx,
+		unsigned num_targets, struct pipe_stream_output_target **targets,
+		const unsigned *offsets)
+{  /* (re)bind the context's stream-out targets and flag FD_DIRTY_STREAMOUT */
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd_streamout_stateobj *so = &ctx->streamout;
+	unsigned i;
+
+	debug_assert(num_targets <= ARRAY_SIZE(so->targets));
+
+	for (i = 0; i < num_targets; i++) {
+		boolean changed = targets[i] != so->targets[i];
+		boolean append = (offsets[i] == (unsigned)-1);  /* (unsigned)-1 is treated as "append" */
+
+		if (!changed && append)  /* same target, appending: keep the tracked offset */
+			continue;
+
+		so->offsets[i] = 0;  /* restart the tracked offset for this slot */
+
+		pipe_so_target_reference(&so->targets[i], targets[i]);
+	}
+
+	for (; i < so->num_targets; i++) {  /* unbind previously used slots beyond num_targets */
+		pipe_so_target_reference(&so->targets[i], NULL);
+	}
+
+	so->num_targets = num_targets;
+
+	ctx->dirty |= FD_DIRTY_STREAMOUT;
+}
+
 void
 fd_state_init(struct pipe_context *pctx)
 {
@@ -328,4 +389,8 @@ fd_state_init(struct pipe_context *pctx)
 	pctx->create_vertex_elements_state = fd_vertex_state_create;
 	pctx->delete_vertex_elements_state = fd_vertex_state_delete;
 	pctx->bind_vertex_elements_state = fd_vertex_state_bind;
+
+	pctx->create_stream_output_target = fd_create_stream_output_target;
+	pctx->stream_output_target_destroy = fd_stream_output_target_destroy;
+	pctx->set_stream_output_targets = fd_set_stream_output_targets;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_surface.c b/src/gallium/drivers/freedreno/freedreno_surface.c
index 250fe4bc0f5..70c44eb79c3 100644
--- a/src/gallium/drivers/freedreno/freedreno_surface.c
+++ b/src/gallium/drivers/freedreno/freedreno_surface.c
@@ -41,7 +41,8 @@ fd_create_surface(struct pipe_context *pctx,
 //	struct fd_resource* tex = fd_resource(ptex);
 	struct fd_surface* surface = CALLOC_STRUCT(fd_surface);
 
-	assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
+	debug_assert(ptex->target != PIPE_BUFFER);
+	debug_assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
 
 	if (surface) {
 		struct pipe_surface *psurf = &surface->base;
diff --git a/src/gallium/drivers/freedreno/freedreno_surface.h b/src/gallium/drivers/freedreno/freedreno_surface.h
index 3293f33dd84..2de37cee2dd 100644
--- a/src/gallium/drivers/freedreno/freedreno_surface.h
+++ b/src/gallium/drivers/freedreno/freedreno_surface.h
@@ -40,7 +40,7 @@ struct fd_surface {
 	uint16_t depth;
 };
 
-static INLINE struct fd_surface *
+static inline struct fd_surface *
 fd_surface(struct pipe_surface *psurf)
 {
 	return (struct fd_surface *)psurf;
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index deb0e602ce2..7129a1bddd1 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -40,6 +40,7 @@
 #include "util/u_dynarray.h"
 #include "util/u_pack_color.h"
 
+#include "disasm.h"
 #include "adreno_common.xml.h"
 #include "adreno_pm4.xml.h"
 
@@ -53,6 +54,12 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 /* TBD if it is same on a2xx, but for now: */
 #define MAX_MIP_LEVELS A3XX_MAX_MIP_LEVELS
 
+#define A2XX_MAX_RENDER_TARGETS 1
+#define A3XX_MAX_RENDER_TARGETS 4
+#define A4XX_MAX_RENDER_TARGETS 8
+
+#define MAX_RENDER_TARGETS A4XX_MAX_RENDER_TARGETS
+
 #define FD_DBG_MSGS     0x0001
 #define FD_DBG_DISASM   0x0002
 #define FD_DBG_DCLEAR   0x0004
@@ -64,6 +71,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_NOBIN    0x0100
 #define FD_DBG_OPTMSGS  0x0200
 #define FD_DBG_GLSL120  0x0400
+#define FD_DBG_SHADERDB 0x0800
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
@@ -108,6 +116,58 @@ pipe_surface_format(struct pipe_surface *psurf)
 	return psurf->format;
 }
 
+/* Decide whether psurf's format tolerates half precision; a NULL psurf
+ * is reported as true. */
+static inline bool
+fd_surface_half_precision(const struct pipe_surface *psurf)
+{
+	enum pipe_format fmt;
+
+	if (!psurf)
+		return true;
+
+	fmt = psurf->format;
+
+	/* colors are provided in consts, which go through cov.f32f16, and
+	 * that conversion would break pure-integer values:
+	 */
+	if (util_format_is_pure_integer(fmt))
+		return false;
+
+	/* avoid losing precision on 32-bit float formats, i.e. float formats
+	 * whose RGB-colorspace component 0 is 32 bits wide: */
+	return !(util_format_is_float(fmt) &&
+		util_format_get_component_bits(fmt, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32);
+}
+
+/* Returns view->u.tex.first_level, or 0 when the view's target is
+ * PIPE_BUFFER (presumably such views do not use u.tex -- confirm). */
+static inline unsigned
+fd_sampler_first_level(const struct pipe_sampler_view *view)
+{
+	return (view->target == PIPE_BUFFER) ? 0 : view->u.tex.first_level;
+}
+
+/* Returns view->u.tex.last_level, or 0 when the view's target is
+ * PIPE_BUFFER (presumably such views do not use u.tex -- confirm). */
+static inline unsigned
+fd_sampler_last_level(const struct pipe_sampler_view *view)
+{
+	return (view->target == PIPE_BUFFER) ? 0 : view->u.tex.last_level;
+}
+
+/* True only if fd_surface_half_precision() holds for all nr_cbufs cbufs. */
+static inline bool
+fd_half_precision(struct pipe_framebuffer_state *pfb)
+{
+	bool ok = true;
+	unsigned idx;
+
+	for (idx = 0; ok && idx < pfb->nr_cbufs; idx++)
+		ok = fd_surface_half_precision(pfb->cbufs[idx]);
+	return ok;
+}
+
 #define LOG_DWORDS 0
 
 static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx);
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 48ae7c71b9f..83ed5ffdca0 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -103,7 +103,7 @@ static void print_reg(reg_t reg, bool full, bool r, bool c, bool im,
 	} else if ((reg.num == REG_P0) && !c) {
 		printf("p0.%c", component[reg.comp]);
 	} else {
-		printf("%s%c%d.%c", full ? "" : "h", type, reg.num, component[reg.comp]);
+		printf("%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]);
 	}
 }
 
@@ -122,6 +122,32 @@ static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im,
 	print_reg(reg, full, r, c, im, neg, abs, addr_rel);
 }
 
+/* TODO: switch to using struct reginfo everywhere, since it is more
+ * readable than passing a bunch of bools to print_reg_src()
+ */
+
+struct reginfo {
+	reg_t reg;
+	bool full;
+	bool r;
+	bool c;
+	bool im;
+	bool neg;
+	bool abs;
+	bool addr_rel;
+};
+
+static void print_src(struct reginfo *info)
+{
+	print_reg_src(info->reg, info->full, info->r, info->c, info->im,
+			info->neg, info->abs, info->addr_rel);
+}
+
+//static void print_dst(struct reginfo *info)
+//{
+//	print_reg_dst(info->reg, info->full, info->addr_rel);
+//}
+
 static void print_instr_cat0(instr_t *instr)
 {
 	instr_cat0_t *cat0 = &instr->cat0;
@@ -454,10 +480,70 @@ static void print_instr_cat6(instr_t *instr)
 {
 	instr_cat6_t *cat6 = &instr->cat6;
 	char sd = 0, ss = 0;  /* dst/src address space */
-	bool full = type_size(cat6->type) == 32;
 	bool nodst = false;
+	struct reginfo dst, src1, src2;
+	int src1off = 0, dstoff = 0;
 
-	printf(".%s ", type[cat6->type]);
+	memset(&dst, 0, sizeof(dst));
+	memset(&src1, 0, sizeof(src1));
+	memset(&src2, 0, sizeof(src2));
+
+	switch (cat6->opc) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	case OPC_L2G:
+	case OPC_G2L:
+		dst.full = true;
+		src1.full = true;
+		src2.full = true;
+		break;
+	case OPC_STG:
+	case OPC_STL:
+	case OPC_STP:
+	case OPC_STI:
+	case OPC_STLW:
+	case OPC_STGB_4D_4:
+	case OPC_STIB:
+		dst.full  = true;
+		src1.full = type_size(cat6->type) == 32;
+		src2.full = type_size(cat6->type) == 32;
+		break;
+	default:
+		dst.full  = type_size(cat6->type) == 32;
+		src1.full = true;
+		src2.full = true;
+		break;
+	}
+
+	switch (cat6->opc) {
+	case OPC_PREFETCH:
+	case OPC_RESINFO:
+		break;
+	case OPC_ATOMIC_ADD:
+	case OPC_ATOMIC_SUB:
+	case OPC_ATOMIC_XCHG:
+	case OPC_ATOMIC_INC:
+	case OPC_ATOMIC_DEC:
+	case OPC_ATOMIC_CMPXCHG:
+	case OPC_ATOMIC_MIN:
+	case OPC_ATOMIC_MAX:
+	case OPC_ATOMIC_AND:
+	case OPC_ATOMIC_OR:
+	case OPC_ATOMIC_XOR:
+		ss = cat6->g ? 'g' : 'l';
+		printf(".%c", ss);
+		printf(".%s", type[cat6->type]);
+		break;
+	default:
+		dst.im = cat6->g && !cat6->dst_off;
+		printf(".%s", type[cat6->type]);
+		break;
+	}
+	printf(" ");
 
 	switch (cat6->opc) {
 	case OPC_STG:
@@ -499,68 +585,65 @@ static void print_instr_cat6(instr_t *instr)
 		break;
 
 	case OPC_STI:
-		full = false;  // XXX or inverts??
+		dst.full = false;  // XXX or inverts??
 		break;
 	}
 
-	if (cat6->has_off) {
-		if (!nodst) {
-			if (sd)
-				printf("%c[", sd);
-			print_reg_dst((reg_t)(cat6->a.dst), full, false);
-			if (sd)
-				printf("]");
-			printf(", ");
-		}
-		if (ss)
-			printf("%c[", ss);
-		print_reg_src((reg_t)(cat6->a.src1), true,
-				false, false, cat6->a.src1_im, false, false, false);
-		if (cat6->a.off)
-			printf("%+d", cat6->a.off);
-		if (ss)
-			printf("]");
-		printf(", ");
-		print_reg_src((reg_t)(cat6->a.src2), full,
-				false, false, cat6->a.src2_im, false, false, false);
+	if (cat6->dst_off) {
+		dst.reg = (reg_t)(cat6->c.dst);
+		dstoff  = cat6->c.off;
 	} else {
-		if (!nodst) {
-			if (sd)
-				printf("%c[", sd);
-			print_reg_dst((reg_t)(cat6->b.dst), full, false);
-			if (sd)
-				printf("]");
-			printf(", ");
-		}
-		if (ss)
-			printf("%c[", ss);
-		print_reg_src((reg_t)(cat6->b.src1), true,
-				false, false, cat6->b.src1_im, false, false, false);
-		if (ss)
-			printf("]");
-		printf(", ");
-		print_reg_src((reg_t)(cat6->b.src2), full,
-				false, false, cat6->b.src2_im, false, false, false);
+		dst.reg = (reg_t)(cat6->d.dst);
 	}
 
-	if (debug & PRINT_VERBOSE) {
-		switch (cat6->opc) {
-		case OPC_LDG:
-		case OPC_LDP:
-			/* load instructions: */
-			if (cat6->a.dummy2|cat6->a.dummy3)
-				printf("\t{6: %x,%x}", cat6->a.dummy2, cat6->a.dummy3);
-			break;
-		case OPC_STG:
-		case OPC_STP:
-		case OPC_STI:
-			/* store instructions: */
-			if (cat6->b.dummy2|cat6->b.dummy2)
-				printf("\t{6: %x,%x}", cat6->b.dummy2, cat6->b.dummy3);
-			if (cat6->b.ignore0)
-				printf("\t{?? %x}", cat6->b.ignore0);
-			break;
-		}
+	if (cat6->src_off) {
+		src1.reg = (reg_t)(cat6->a.src1);
+		src1.im  = cat6->a.src1_im;
+		src2.reg = (reg_t)(cat6->a.src2);
+		src2.im  = cat6->a.src2_im;
+		src1off  = cat6->a.off;
+	} else {
+		src1.reg = (reg_t)(cat6->b.src1);
+		src1.im  = cat6->b.src1_im;
+		src2.reg = (reg_t)(cat6->b.src2);
+		src2.im  = cat6->b.src2_im;
+	}
+
+	if (!nodst) {
+		if (sd)
+			printf("%c[", sd);
+		/* note: dst might actually be a src (ie. address to store to) */
+		print_src(&dst);
+		if (dstoff)
+			printf("%+d", dstoff);
+		if (sd)
+			printf("]");
+		printf(", ");
+	}
+
+	if (ss)
+		printf("%c[", ss);
+
+	/* can have a larger than normal immed, so hack: */
+	if (src1.im) {
+		printf("%u", src1.reg.dummy13);
+	} else {
+		print_src(&src1);
+	}
+
+	if (src1off)
+		printf("%+d", src1off);
+	if (ss)
+		printf("]");
+
+	switch (cat6->opc) {
+	case OPC_RESINFO:
+	case OPC_RESFMT:
+		break;
+	default:
+		printf(", ");
+		print_src(&src2);
+		break;
 	}
 }
 
@@ -711,19 +794,19 @@ struct opc_info {
 	OPC(6, OPC_LDLW,         ldlw),
 	OPC(6, OPC_STLW,         stlw),
 	OPC(6, OPC_RESFMT,       resfmt),
-	OPC(6, OPC_RESINFO,      resinf),
-	OPC(6, OPC_ATOMIC_ADD_L,     atomic.add.l),
-	OPC(6, OPC_ATOMIC_SUB_L,     atomic.sub.l),
-	OPC(6, OPC_ATOMIC_XCHG_L,    atomic.xchg.l),
-	OPC(6, OPC_ATOMIC_INC_L,     atomic.inc.l),
-	OPC(6, OPC_ATOMIC_DEC_L,     atomic.dec.l),
-	OPC(6, OPC_ATOMIC_CMPXCHG_L, atomic.cmpxchg.l),
-	OPC(6, OPC_ATOMIC_MIN_L,     atomic.min.l),
-	OPC(6, OPC_ATOMIC_MAX_L,     atomic.max.l),
-	OPC(6, OPC_ATOMIC_AND_L,     atomic.and.l),
-	OPC(6, OPC_ATOMIC_OR_L,      atomic.or.l),
-	OPC(6, OPC_ATOMIC_XOR_L,     atomic.xor.l),
-	OPC(6, OPC_LDGB_TYPED_4D,    ldgb.typed.4d),
+	OPC(6, OPC_RESINFO,      resinfo),
+	OPC(6, OPC_ATOMIC_ADD,     atomic.add),
+	OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
+	OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
+	OPC(6, OPC_ATOMIC_INC,     atomic.inc),
+	OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
+	OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
+	OPC(6, OPC_ATOMIC_MIN,     atomic.min),
+	OPC(6, OPC_ATOMIC_MAX,     atomic.max),
+	OPC(6, OPC_ATOMIC_AND,     atomic.and),
+	OPC(6, OPC_ATOMIC_OR,      atomic.or),
+	OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
+	OPC(6, OPC_LDGB_TYPED_4D,    ldgb.typed.3d),
 	OPC(6, OPC_STGB_4D_4,    stgb.4d.4),
 	OPC(6, OPC_STIB,         stib),
 	OPC(6, OPC_LDC_4,        ldc.4),
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index efb07ea479e..c3fb68d511c 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -173,17 +173,17 @@ typedef enum {
 	OPC_STLW = 11,
 	OPC_RESFMT = 14,
 	OPC_RESINFO = 15,
-	OPC_ATOMIC_ADD_L = 16,
-	OPC_ATOMIC_SUB_L = 17,
-	OPC_ATOMIC_XCHG_L = 18,
-	OPC_ATOMIC_INC_L = 19,
-	OPC_ATOMIC_DEC_L = 20,
-	OPC_ATOMIC_CMPXCHG_L = 21,
-	OPC_ATOMIC_MIN_L = 22,
-	OPC_ATOMIC_MAX_L = 23,
-	OPC_ATOMIC_AND_L = 24,
-	OPC_ATOMIC_OR_L = 25,
-	OPC_ATOMIC_XOR_L = 26,
+	OPC_ATOMIC_ADD = 16,
+	OPC_ATOMIC_SUB = 17,
+	OPC_ATOMIC_XCHG = 18,
+	OPC_ATOMIC_INC = 19,
+	OPC_ATOMIC_DEC = 20,
+	OPC_ATOMIC_CMPXCHG = 21,
+	OPC_ATOMIC_MIN = 22,
+	OPC_ATOMIC_MAX = 23,
+	OPC_ATOMIC_AND = 24,
+	OPC_ATOMIC_OR = 25,
+	OPC_ATOMIC_XOR = 26,
 	OPC_LDGB_TYPED_4D = 27,
 	OPC_STGB_4D_4 = 28,
 	OPC_STIB = 29,
@@ -575,7 +575,7 @@ typedef struct PACKED {
 	uint32_t opc_cat  : 3;
 } instr_cat5_t;
 
-/* [src1 + off], src2: */
+/* dword0 encoding for src_off: [src1 + off], src2: */
 typedef struct PACKED {
 	/* dword0: */
 	uint32_t mustbe1  : 1;
@@ -586,37 +586,50 @@ typedef struct PACKED {
 	uint32_t src2     : 8;
 
 	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t dummy2   : 9;
-	uint32_t type     : 3;
-	uint32_t dummy3   : 2;
-	uint32_t opc      : 5;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
+	uint32_t dword1;
 } instr_cat6a_t;
 
-/* [src1], src2: */
+/* dword0 encoding for !src_off: [src1], src2 */
 typedef struct PACKED {
 	/* dword0: */
 	uint32_t mustbe0  : 1;
-	uint32_t src1     : 8;
-	uint32_t ignore0  : 13;
+	uint32_t src1     : 13;
+	uint32_t ignore0  : 8;
 	uint32_t src1_im  : 1;
 	uint32_t src2_im  : 1;
 	uint32_t src2     : 8;
 
 	/* dword1: */
-	uint32_t dst      : 8;
-	uint32_t dummy2   : 9;
-	uint32_t type     : 3;
-	uint32_t dummy3   : 2;
-	uint32_t opc      : 5;
-	uint32_t jmp_tgt  : 1;
-	uint32_t sync     : 1;
-	uint32_t opc_cat  : 3;
+	uint32_t dword1;
 } instr_cat6b_t;
 
+/* dword1 encoding for dst_off: [dst + off] */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;
+
+	/* dword1:
+	 * note: sometimes cat6->a.off seems to be involved as well, but
+	 * that looks like a bug in the blob, since it is used even if
+	 * !cat6->src_off.  It would make sense for there to be some more
+	 * bits to bring us to 11 bits worth of offset, but not sure..
+	 */
+	int32_t off       : 8;
+	uint32_t mustbe1  : 1;   /* same bit as dst_off in instr_cat6_t */
+	uint32_t dst      : 8;
+	uint32_t pad1     : 15;
+} instr_cat6c_t;
+
+/* dword1 encoding for !dst_off: plain dst, no offset */
+typedef struct PACKED {
+	/* dword0: */
+	uint32_t dword0;
+
+	uint32_t dst      : 8;   /* dword1 starts here */
+	uint32_t mustbe0  : 1;   /* same bit as dst_off in instr_cat6_t */
+	uint32_t pad0     : 23;
+} instr_cat6d_t;
+
 /* I think some of the other cat6 instructions use additional
  * sub-encodings..
  */
@@ -624,16 +637,20 @@ typedef struct PACKED {
 typedef union PACKED {
 	instr_cat6a_t a;
 	instr_cat6b_t b;
+	instr_cat6c_t c;
+	instr_cat6d_t d;
 	struct PACKED {
 		/* dword0: */
-		uint32_t has_off  : 1;
+		uint32_t src_off  : 1;
 		uint32_t pad1     : 31;
 
 		/* dword1: */
-		uint32_t dst      : 8;
-		uint32_t dummy2   : 9;
+		uint32_t pad2     : 8;
+		uint32_t dst_off  : 1;
+		uint32_t pad3     : 8;
 		uint32_t type     : 3;
-		uint32_t dummy3   : 2;
+		uint32_t g        : 1;  /* or in some cases it means dst immed */
+		uint32_t pad4     : 1;
 		uint32_t opc      : 5;
 		uint32_t jmp_tgt  : 1;
 		uint32_t sync     : 1;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index a166b67d7cf..b24825cff85 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -499,32 +499,51 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr,
 static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 		struct ir3_info *info)
 {
-	struct ir3_register *dst  = instr->regs[0];
-	struct ir3_register *src1 = instr->regs[1];
-	struct ir3_register *src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+	struct ir3_register *dst, *src1, *src2;
 	instr_cat6_t *cat6 = ptr;
 
-	iassert(instr->regs_count >= 2);
+	/* the "dst" for a store instruction is (from the perspective
+	 * of data flow in the shader, ie. register use/def, etc) in
+	 * fact a register that is read by the instruction, rather
+	 * than written:
+	 */
+	if (is_store(instr)) {
+		iassert(instr->regs_count >= 3);
 
-	if (instr->cat6.offset || instr->opc == OPC_LDG) {
+		dst  = instr->regs[1];
+		src1 = instr->regs[2];
+		src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
+	} else {
+		iassert(instr->regs_count >= 2);
+
+		dst  = instr->regs[0];
+		src1 = instr->regs[1];
+		src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+	}
+
+
+	/* TODO we need a more comprehensive list about which instructions
+	 * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
+	 * indicate to use the src_off encoding even if offset is zero
+	 * (but then what to do about dst_off?)
+	 */
+	if (instr->cat6.src_offset || (instr->opc == OPC_LDG)) {
 		instr_cat6a_t *cat6a = ptr;
 
-		cat6->has_off = true;
+		cat6->src_off = true;
 
-		cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
 		cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
 		cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
 		if (src2) {
 			cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
 			cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
 		}
-		cat6a->off = instr->cat6.offset;
+		cat6a->off = instr->cat6.src_offset;
 	} else {
 		instr_cat6b_t *cat6b = ptr;
 
-		cat6->has_off = false;
+		cat6->src_off = false;
 
-		cat6b->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
 		cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
 		cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
 		if (src2) {
@@ -533,10 +552,22 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 		}
 	}
 
+	if (instr->cat6.dst_offset || (instr->opc == OPC_STG)) {
+		instr_cat6c_t *cat6c = ptr;
+		cat6->dst_off = true;
+		cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+		cat6c->off = instr->cat6.dst_offset;
+	} else {
+		instr_cat6d_t *cat6d = ptr;
+		cat6->dst_off = false;
+		cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+	}
+
 	cat6->type     = instr->cat6.type;
 	cat6->opc      = instr->opc;
 	cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
 	cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+	cat6->g        = !!(instr->flags & IR3_INSTR_G);
 	cat6->opc_cat  = 6;
 
 	return 0;
@@ -669,7 +700,6 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
 	return ir3_instr_create2(block, category, opc, 4);
 }
 
-/* only used by old compiler: */
 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
 {
 	struct ir3_instruction *new_instr = instr_create(instr->block,
@@ -707,6 +737,17 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 	return reg;
 }
 
+void
+ir3_instr_set_address(struct ir3_instruction *instr,
+		struct ir3_instruction *addr)
+{
+	if (instr->address != addr) {   /* no-op if addr is already set */
+		struct ir3 *ir = instr->block->shader;
+		instr->address = addr;
+		array_insert(ir->indirects, instr);   /* track instr in ir->indirects */
+	}
+}
+
 void
 ir3_block_clear_mark(struct ir3_block *block)
 {
@@ -723,15 +764,16 @@ ir3_clear_mark(struct ir3 *ir)
 }
 
 /* note: this will destroy instr->depth, don't do it until after sched! */
-void
+unsigned
 ir3_count_instructions(struct ir3 *ir)
 {
-	unsigned ip = 0;
+	unsigned cnt = 0;
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-			instr->ip = ip++;
+			instr->ip = cnt++;
 		}
 		block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
 		block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
 	}
+	return cnt;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 9c35a763d58..12f2ebe18db 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -172,6 +172,7 @@ struct ir3_instruction {
 		IR3_INSTR_P     = 0x080,
 		IR3_INSTR_S     = 0x100,
 		IR3_INSTR_S2EN  = 0x200,
+		IR3_INSTR_G     = 0x400,
 		/* meta-flags, for intermediate stages of IR, ie.
 		 * before register assignment is done:
 		 */
@@ -209,7 +210,8 @@ struct ir3_instruction {
 		} cat5;
 		struct {
 			type_t type;
-			int offset;
+			int src_offset;
+			int dst_offset;
 			int iim_val;
 		} cat6;
 		/* for meta-instructions, just used to hold extra data
@@ -285,6 +287,8 @@ struct ir3_instruction {
 
 	/* an instruction can reference at most one address register amongst
 	 * it's src/dst registers.  Beyond that, you need to insert mov's.
+	 *
+	 * NOTE: do not write this directly, use ir3_instr_set_address()
 	 */
 	struct ir3_instruction *address;
 
@@ -365,6 +369,12 @@ struct ir3 {
 	unsigned predicates_count, predicates_sz;
 	struct ir3_instruction **predicates;
 
+	/* Track instructions which do not write a register but which
+	 * otherwise must not be discarded (such as kill, stg, etc)
+	 */
+	unsigned keeps_count, keeps_sz;
+	struct ir3_instruction **keeps;
+
 	/* List of blocks: */
 	struct list_head block_list;
 
@@ -420,6 +430,9 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 		int num, int flags);
 
+void ir3_instr_set_address(struct ir3_instruction *instr,
+		struct ir3_instruction *addr);
+
 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
 {
 	if (instr->flags & IR3_INSTR_MARK)
@@ -431,7 +444,7 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
 void ir3_block_clear_mark(struct ir3_block *block);
 void ir3_clear_mark(struct ir3 *shader);
 
-void ir3_count_instructions(struct ir3 *ir);
+unsigned ir3_count_instructions(struct ir3 *ir);
 
 static inline int ir3_instr_regno(struct ir3_instruction *instr,
 		struct ir3_register *reg)
@@ -547,6 +560,26 @@ is_store(struct ir3_instruction *instr)
 	return false;
 }
 
+static inline bool is_load(struct ir3_instruction *instr)
+{
+	if (is_mem(instr)) {   /* opcodes are only checked for is_mem() instrs */
+		switch (instr->opc) {
+		case OPC_LDG:
+		case OPC_LDL:
+		case OPC_LDP:
+		case OPC_L2G:
+		case OPC_LDLW:
+		case OPC_LDC_4:
+		case OPC_LDLV:
+		/* TODO: probably some other load opcodes belong here too.. */
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
 static inline bool is_input(struct ir3_instruction *instr)
 {
 	/* in some cases, ldlv is used to fetch varying without
@@ -1036,6 +1069,7 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
 /* cat6 instructions: */
 INSTR2(6, LDLV)
 INSTR2(6, LDG)
+INSTR3(6, STG)
 
 /* ************************************************************************* */
 /* split this out or find some helper to use.. like main/bitset.h.. */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index ad9d2719d59..ede29f445dc 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -43,127 +43,15 @@
 #include "instr-a3xx.h"
 #include "ir3.h"
 
-static void dump_reg(const char *name, uint32_t r)
-{
-	if (r != regid(63,0))
-		debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
-}
-
-static void dump_semantic(struct ir3_shader_variant *so,
-		unsigned sem, const char *name)
-{
-	uint32_t regid;
-	regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0));
-	dump_reg(name, regid);
-}
-
 static void dump_info(struct ir3_shader_variant *so, const char *str)
 {
 	uint32_t *bin;
-	const char *type = (so->type == SHADER_VERTEX) ? "VERT" : "FRAG";
-
-	// for debug, dump some before/after info:
+	const char *type = ir3_shader_stage(so->shader);
 	// TODO make gpu_id configurable on cmdline
 	bin = ir3_shader_assemble(so, 320);
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		struct ir3 *ir = so->ir;
-		struct ir3_register *reg;
-		uint8_t regid;
-		unsigned i;
-
-		debug_printf("; %s: %s\n", type, str);
-
-		for (i = 0; i < ir->ninputs; i++) {
-			if (!ir->inputs[i]) {
-				debug_printf("; in%d unused\n", i);
-				continue;
-			}
-			reg = ir->inputs[i]->regs[0];
-			regid = reg->num;
-			debug_printf("@in(%sr%d.%c)\tin%d\n",
-					(reg->flags & IR3_REG_HALF) ? "h" : "",
-					(regid >> 2), "xyzw"[regid & 0x3], i);
-		}
-
-		for (i = 0; i < ir->noutputs; i++) {
-			if (!ir->outputs[i]) {
-				debug_printf("; out%d unused\n", i);
-				continue;
-			}
-			/* kill shows up as a virtual output.. skip it! */
-			if (is_kill(ir->outputs[i]))
-				continue;
-			reg = ir->outputs[i]->regs[0];
-			regid = reg->num;
-			debug_printf("@out(%sr%d.%c)\tout%d\n",
-					(reg->flags & IR3_REG_HALF) ? "h" : "",
-					(regid >> 2), "xyzw"[regid & 0x3], i);
-		}
-
-		for (i = 0; i < so->immediates_count; i++) {
-			debug_printf("@const(c%d.x)\t", so->first_immediate + i);
-			debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
-					so->immediates[i].val[0],
-					so->immediates[i].val[1],
-					so->immediates[i].val[2],
-					so->immediates[i].val[3]);
-		}
-
-		disasm_a3xx(bin, so->info.sizedwords, 0, so->type);
-
-		debug_printf("; %s: outputs:", type);
-		for (i = 0; i < so->outputs_count; i++) {
-			uint8_t regid = so->outputs[i].regid;
-			ir3_semantic sem = so->outputs[i].semantic;
-			debug_printf(" r%d.%c (%u:%u)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					sem2name(sem), sem2idx(sem));
-		}
-		debug_printf("\n");
-		debug_printf("; %s: inputs:", type);
-		for (i = 0; i < so->inputs_count; i++) {
-			uint8_t regid = so->inputs[i].regid;
-			ir3_semantic sem = so->inputs[i].semantic;
-			debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)",
-					(regid >> 2), "xyzw"[regid & 0x3],
-					sem2name(sem), sem2idx(sem),
-					so->inputs[i].compmask,
-					so->inputs[i].inloc,
-					so->inputs[i].bary);
-		}
-		debug_printf("\n");
-	}
-
-	/* print generic shader info: */
-	debug_printf("; %s: %u instructions, %d half, %d full\n", type,
-			so->info.instrs_count,
-			so->info.max_half_reg + 1,
-			so->info.max_reg + 1);
-
-	/* print shader type specific info: */
-	switch (so->type) {
-	case SHADER_VERTEX:
-		dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos");
-		dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize");
-		break;
-	case SHADER_FRAGMENT:
-		dump_reg("pos (bary)", so->pos_regid);
-		dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz");
-		dump_semantic(so, TGSI_SEMANTIC_COLOR, "color");
-		/* these two are hard-coded since we don't know how to
-		 * program them to anything but all 0's...
-		 */
-		if (so->frag_coord)
-			debug_printf("; fragcoord: r0.x\n");
-		if (so->frag_face)
-			debug_printf("; fragface: hr0.x\n");
-		break;
-	case SHADER_COMPUTE:
-		break;
-	}
+	debug_printf("; %s: %s\n", type, str);
+	ir3_shader_disasm(so, bin);
 	free(bin);
-
-	debug_printf("\n");
 }
 
 
@@ -205,8 +93,7 @@ static void print_usage(void)
 	printf("    --saturate-s MASK - bitmask of samplers to saturate S coord\n");
 	printf("    --saturate-t MASK - bitmask of samplers to saturate T coord\n");
 	printf("    --saturate-r MASK - bitmask of samplers to saturate R coord\n");
-	printf("    --nocp            - disable copy propagation\n");
-	printf("    --nir             - use NIR compiler\n");
+	printf("    --stream-out      - enable stream-out (aka transform feedback)\n");
 	printf("    --help            - show this message\n");
 }
 
@@ -218,6 +105,7 @@ int main(int argc, char **argv)
 	struct tgsi_parse_context parse;
 	struct ir3_compiler *compiler;
 	struct ir3_shader_variant v;
+	struct ir3_shader s;
 	struct ir3_shader_key key = {};
 	const char *info;
 	void *ptr;
@@ -225,6 +113,9 @@ int main(int argc, char **argv)
 
 	fd_mesa_debug |= FD_DBG_DISASM;
 
+	memset(&s, 0, sizeof(s));
+	memset(&v, 0, sizeof(v));
+
 	/* cmdline args which impact shader variant get spit out in a
 	 * comment on the first line..  a quick/dirty way to preserve
 	 * that info so when ir3test recompiles the shader with a new
@@ -281,6 +172,24 @@ int main(int argc, char **argv)
 			continue;
 		}
 
+		if (!strcmp(argv[n], "--stream-out")) {
+			struct pipe_stream_output_info *so = &s.stream_output;
+			debug_printf(" %s", argv[n]);
+			/* TODO more dynamic config based on number of outputs, etc
+			 * rather than just hard-code for first output:
+			 */
+			so->num_outputs = 1;
+			so->stride[0] = 4;
+			so->output[0].register_index = 0;
+			so->output[0].start_component = 0;
+			so->output[0].num_components = 4;
+			so->output[0].output_buffer = 0;
+			so->output[0].dst_offset = 2;
+			so->output[0].stream = 0;
+			n++;
+			continue;
+		}
+
 		if (!strcmp(argv[n], "--help")) {
 			print_usage();
 			return 0;
@@ -292,9 +201,6 @@ int main(int argc, char **argv)
 
 	filename = argv[n];
 
-	memset(&v, 0, sizeof(v));
-	v.key = key;
-
 	ret = read_file(filename, &ptr, &size);
 	if (ret) {
 		print_usage();
@@ -307,16 +213,21 @@ int main(int argc, char **argv)
 	if (!tgsi_text_translate(ptr, toks, Elements(toks)))
 		errx(1, "could not parse `%s'", filename);
 
+	s.tokens = toks;
+
+	v.key = key;
+	v.shader = &s;
+
 	tgsi_parse_init(&parse, toks);
 	switch (parse.FullHeader.Processor.Processor) {
 	case TGSI_PROCESSOR_FRAGMENT:
-		v.type = SHADER_FRAGMENT;
+		s.type = v.type = SHADER_FRAGMENT;
 		break;
 	case TGSI_PROCESSOR_VERTEX:
-		v.type = SHADER_VERTEX;
+		s.type = v.type = SHADER_VERTEX;
 		break;
 	case TGSI_PROCESSOR_COMPUTE:
-		v.type = SHADER_COMPUTE;
+		s.type = v.type = SHADER_COMPUTE;
 		break;
 	}
 
@@ -324,7 +235,7 @@ int main(int argc, char **argv)
 	compiler = ir3_compiler_create(320);
 
 	info = "NIR compiler";
-	ret = ir3_compile_shader_nir(compiler, &v, toks, key);
+	ret = ir3_compile_shader_nir(compiler, &v);
 	if (ret) {
 		fprintf(stderr, "compiler failed!\n");
 		return ret;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
index 86b1161d9cb..697afeba61a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -36,14 +36,13 @@ struct ir3_ra_reg_set;
 struct ir3_compiler {
 	uint32_t gpu_id;
 	struct ir3_ra_reg_set *set;
+	uint32_t shader_count;
 };
 
 struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id);
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
 
 int ir3_compile_shader_nir(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens,
-		struct ir3_shader_key key);
+		struct ir3_shader_variant *so);
 
 #endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 48b1d8f3606..0ab33455ed1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -117,10 +117,6 @@ struct ir3_compile {
 	/* for looking up which system value is which */
 	unsigned sysval_semantics[8];
 
-	/* list of kill instructions: */
-	struct ir3_instruction *kill[16];
-	unsigned int kill_count;
-
 	/* set if we encounter something we can't handle yet, so we
 	 * can bail cleanly and fallback to TGSI compiler f/e
 	 */
@@ -153,6 +149,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 	nir_opt_global_to_local(s);
 	nir_convert_to_ssa(s);
 	nir_lower_idiv(s);
+	nir_lower_load_const_to_scalar(s);
 
 	do {
 		progress = false;
@@ -261,13 +258,29 @@ compile_init(struct ir3_compiler *compiler,
 
 	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
 
-	/* one (vec4) slot for vertex id base: */
-	if (so->type == SHADER_VERTEX)
-		so->first_immediate++;
+	/* Layout of constant registers:
+	 *
+	 *    num_uniform * vec4  -  user consts
+	 *    4 * vec4            -  UBO addresses
+	 *    if (vertex shader) {
+	 *        1 * vec4        -  driver params (IR3_DP_*)
+	 *        1 * vec4        -  stream-out addresses
+	 *    }
+	 *
+	 * TODO this could be made more dynamic, to at least skip sections
+	 * that we don't need..
+	 */
 
 	/* reserve 4 (vec4) slots for ubo base addresses: */
 	so->first_immediate += 4;
 
+	if (so->type == SHADER_VERTEX) {
+		/* one (vec4) slot for driver params (see ir3_driver_param): */
+		so->first_immediate++;
+		/* one (vec4) slot for stream-output base addresses: */
+		so->first_immediate++;
+	}
+
 	return ctx;
 }
 
@@ -637,9 +650,8 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
 	mov->cat1.dst_type = TYPE_U32;
 	ir3_reg_create(mov, 0, 0);
 	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
-	mov->address = address;
 
-	array_insert(ctx->ir->indirects, mov);
+	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
@@ -677,9 +689,8 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 	src->instr = collect;
 	src->size  = arrsz;
 	src->offset = n;
-	mov->address = address;
 
-	array_insert(ctx->ir->indirects, mov);
+	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
@@ -700,25 +711,21 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
 	dst->size  = arrsz;
 	dst->offset = n;
 	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-	mov->address = address;
 	mov->fanin = collect;
 
-	array_insert(ctx->ir->indirects, mov);
+	ir3_instr_set_address(mov, address);
 
 	return mov;
 }
 
 static struct ir3_instruction *
-create_input(struct ir3_block *block, struct ir3_instruction *instr,
-		unsigned n)
+create_input(struct ir3_block *block, unsigned n)
 {
 	struct ir3_instruction *in;
 
 	in = ir3_instr_create(block, -1, OPC_META_INPUT);
 	in->inout.block = block;
 	ir3_reg_create(in, n, 0);
-	if (instr)
-		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
 
 	return in;
 }
@@ -750,7 +757,7 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp)
 
 	compile_assert(ctx, !ctx->frag_coord[comp]);
 
-	ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0);
+	ctx->frag_coord[comp] = create_input(ctx->block, 0);
 
 	switch (comp) {
 	case 0: /* .x */
@@ -789,7 +796,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
 	case 0: /* .x */
 		compile_assert(ctx, !ctx->frag_face);
 
-		ctx->frag_face = create_input(block, NULL, 0);
+		ctx->frag_face = create_input(block, 0);
 		ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 
 		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
@@ -817,6 +824,14 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
 	}
 }
 
+static struct ir3_instruction *
+create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
+{
+	/* first four vec4 sysvals are reserved for UBOs: */
+	unsigned r = regid(ctx->so->first_driver_param + 4, dp);
+	return create_uniform(ctx, r);
+}
+
 /* helper for instructions that produce multiple consecutive scalar
  * outputs which need to have a split/fanout meta instruction inserted
  */
@@ -1218,7 +1233,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction *load =
 				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
 		load->cat6.type = TYPE_U32;
-		load->cat6.offset = off + i * 4;    /* byte offset */
+		load->cat6.src_offset = off + i * 4;     /* byte offset */
 		dst[i] = load;
 	}
 }
@@ -1307,7 +1322,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			 * store_output_indirect? or move this into
 			 * create_indirect_store()?
 			 */
-			for (int j = i; j < arr->length; j += 4) {
+			for (int j = i; j < arr->length; j += intr->num_components) {
 				struct ir3_instruction *split;
 
 				split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
@@ -1318,6 +1333,13 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 				arr->arr[j] = split;
 			}
 		}
+		/* fixup fanout/split neighbors: */
+		for (int i = 0; i < arr->length; i++) {
+			arr->arr[i]->cp.right = (i < (arr->length - 1)) ?
+					arr->arr[i+1] : NULL;
+			arr->arr[i]->cp.left = (i > 0) ?
+					arr->arr[i-1] : NULL;
+		}
 		break;
 	}
 	default:
@@ -1372,6 +1394,11 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			dst[i] = create_uniform_indirect(ctx, n,
 					get_addr(ctx, src[0]));
 		}
+		/* NOTE: if relative addressing is used, we set constlen in
+		 * the compiler (to worst-case value) since we don't know in
+		 * the assembler what the max addr reg value can be:
+		 */
+		ctx->so->constlen = ctx->s->num_uniforms;
 		break;
 	case nir_intrinsic_load_ubo:
 	case nir_intrinsic_load_ubo_indirect:
@@ -1409,9 +1436,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		break;
 	case nir_intrinsic_load_base_vertex:
 		if (!ctx->basevertex) {
-			/* first four vec4 sysval's reserved for UBOs: */
-			unsigned r = regid(ctx->so->first_driver_param + 4, 0);
-			ctx->basevertex = create_uniform(ctx, r);
+			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
 			add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
 					ctx->basevertex);
 		}
@@ -1419,7 +1444,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		break;
 	case nir_intrinsic_load_vertex_id_zero_base:
 		if (!ctx->vertex_id) {
-			ctx->vertex_id = create_input(ctx->block, NULL, 0);
+			ctx->vertex_id = create_input(ctx->block, 0);
 			add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
 					ctx->vertex_id);
 		}
@@ -1427,7 +1452,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		break;
 	case nir_intrinsic_load_instance_id:
 		if (!ctx->instance_id) {
-			ctx->instance_id = create_input(ctx->block, NULL, 0);
+			ctx->instance_id = create_input(ctx->block, 0);
 			add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
 					ctx->instance_id);
 		}
@@ -1456,7 +1481,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		kill = ir3_KILL(b, cond, 0);
 		array_insert(ctx->ir->predicates, kill);
 
-		ctx->kill[ctx->kill_count++] = kill;
+		array_insert(ctx->ir->keeps, kill);
 		ctx->so->has_kill = true;
 
 		break;
@@ -1950,6 +1975,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
 	}
 }
 
+/* emit stream-out code.  At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block.  We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ *   blockOrigEnd {
+ *      p0.x = (vtxcnt < maxvtxcnt)
+ *      // succs: blockStreamOut, blockNewEnd
+ *   }
+ *   blockStreamOut {
+ *      ... stream-out instructions ...
+ *      // succs: blockNewEnd
+ *   }
+ *   blockNewEnd {
+ *   }
+ */
+static void
+emit_stream_out(struct ir3_compile *ctx)
+{
+	struct ir3_shader_variant *v = ctx->so;
+	struct ir3 *ir = ctx->ir;
+	struct pipe_stream_output_info *strmout =
+			&ctx->so->shader->stream_output;
+	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+	struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];
+
+	/* create vtxcnt input in input block at top of shader,
+	 * so that it is seen as live over the entire duration
+	 * of the shader:
+	 */
+	vtxcnt = create_input(ctx->in_block, 0);
+	add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);
+
+	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+	/* at this point, we are at the original 'end' block,
+	 * re-purpose this block to stream-out condition, then
+	 * append stream-out block and new-end block
+	 */
+	orig_end_block = ctx->block;
+
+	stream_out_block = ir3_block_create(ir);
+	list_addtail(&stream_out_block->node, &ir->block_list);
+
+	new_end_block = ir3_block_create(ir);
+	list_addtail(&new_end_block->node, &ir->block_list);
+
+	orig_end_block->successors[0] = stream_out_block;
+	orig_end_block->successors[1] = new_end_block;
+	stream_out_block->successors[0] = new_end_block;
+
+	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
+	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+	cond->regs[0]->num = regid(REG_P0, 0);
+	cond->cat2.condition = IR3_COND_LT;
+
+	/* condition goes on previous block to the conditional,
+	 * since it is used to pick which of the two successor
+	 * paths to take:
+	 */
+	orig_end_block->condition = cond;
+
+	/* switch to stream_out_block to generate the stream-out
+	 * instructions:
+	 */
+	ctx->block = stream_out_block;
+
+	/* Calculate base addresses based on vtxcnt.  Instructions
+	 * generated for bases not used in following loop will be
+	 * stripped out in the backend.
+	 */
+	for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+		unsigned stride = strmout->stride[i];
+		struct ir3_instruction *base, *off;
+
+		base = create_uniform(ctx, regid(v->first_driver_param + 5, i));
+
+		/* 24-bit should be enough: */
+		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+				create_immed(ctx->block, stride * 4), 0);
+
+		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+	}
+
+	/* Generate the per-output store instructions: */
+	for (unsigned i = 0; i < strmout->num_outputs; i++) {
+		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+			unsigned c = j + strmout->output[i].start_component;
+			struct ir3_instruction *base, *out, *stg;
+
+			base = bases[strmout->output[i].output_buffer];
+			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+			stg = ir3_STG(ctx->block, base, 0, out, 0,
+					create_immed(ctx->block, 1), 0);
+			stg->cat6.type = TYPE_U32;
+			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
+
+			array_insert(ctx->ir->keeps, stg);
+		}
+	}
+
+	/* and finally switch to the new_end_block: */
+	ctx->block = new_end_block;
+}
+
 static void
 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 {
@@ -1960,6 +2094,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
 	 * into which we emit the 'end' instruction.
 	 */
 	compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+	/* If stream-out (aka transform-feedback) enabled, emit the
+	 * stream-out instructions, followed by a new empty block (into
+	 * which the 'end' instruction lands).
+	 *
+	 * NOTE: it is done in this order, rather than inserting before
+	 * we emit end_block, because NIR guarantees that all blocks
+	 * flow into end_block, and that end_block has no successors.
+	 * So by re-purposing end_block as the first block of stream-
+	 * out, we guarantee that all exit paths flow into the stream-
+	 * out instructions.
+	 */
+	if ((ctx->so->shader->stream_output.num_outputs > 0) &&
+			!ctx->so->key.binning_pass) {
+		debug_assert(ctx->so->type == SHADER_VERTEX);
+		emit_stream_out(ctx);
+	}
+
 	ir3_END(ctx->block);
 }
 
@@ -1974,7 +2126,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 	unsigned semantic_index = in->data.index;
 	unsigned n = in->data.driver_location;
 
-	DBG("; in: %u:%u, len=%ux%u, loc=%u\n",
+	DBG("; in: %u:%u, len=%ux%u, loc=%u",
 			semantic_name, semantic_index, array_len,
 			ncomp, n);
 
@@ -2045,7 +2197,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 						so->inputs[n].inloc + i - 8, use_ldlv);
 			}
 		} else {
-			instr = create_input(ctx->block, NULL, idx);
+			instr = create_input(ctx->block, idx);
 		}
 
 		ctx->ir->inputs[idx] = instr;
@@ -2069,7 +2221,7 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 	unsigned n = out->data.driver_location;
 	unsigned comp = 0;
 
-	DBG("; out: %u:%u, len=%ux%u, loc=%u\n",
+	DBG("; out: %u:%u, len=%ux%u, loc=%u",
 			semantic_name, semantic_index, array_len,
 			ncomp, n);
 
@@ -2098,6 +2250,10 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 			so->writes_pos = true;
 			break;
 		case TGSI_SEMANTIC_COLOR:
+			if (semantic_index == -1) {
+				semantic_index = 0;
+				so->color0_mrt = 1;
+			}
 			break;
 		default:
 			compile_error(ctx, "unknown FS semantic name: %s\n",
@@ -2136,13 +2292,9 @@ emit_instructions(struct ir3_compile *ctx)
 	ninputs  = exec_list_length(&ctx->s->inputs) * 4;
 	noutputs = exec_list_length(&ctx->s->outputs) * 4;
 
-	/* we need to allocate big enough outputs array so that
-	 * we can stuff the kill's at the end.  Likewise for vtx
-	 * shaders, we need to leave room for sysvals:
+	/* for vtx shaders, we need to leave room for sysvals:
 	 */
-	if (ctx->so->type == SHADER_FRAGMENT) {
-		noutputs += ARRAY_SIZE(ctx->kill);
-	} else if (ctx->so->type == SHADER_VERTEX) {
+	if (ctx->so->type == SHADER_VERTEX) {
 		ninputs += 8;
 	}
 
@@ -2153,9 +2305,7 @@ emit_instructions(struct ir3_compile *ctx)
 	ctx->in_block = ctx->block;
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
-	if (ctx->so->type == SHADER_FRAGMENT) {
-		ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill);
-	} else if (ctx->so->type == SHADER_VERTEX) {
+	if (ctx->so->type == SHADER_VERTEX) {
 		ctx->ir->ninputs -= 8;
 	}
 
@@ -2254,13 +2404,13 @@ fixup_frag_inputs(struct ir3_compile *ctx)
 	so->pos_regid = regid;
 
 	/* r0.x */
-	instr = create_input(ctx->in_block, NULL, ir->ninputs);
+	instr = create_input(ctx->in_block, ir->ninputs);
 	instr->regs[0]->num = regid++;
 	inputs[ir->ninputs++] = instr;
 	ctx->frag_pos->regs[1]->instr = instr;
 
 	/* r0.y */
-	instr = create_input(ctx->in_block, NULL, ir->ninputs);
+	instr = create_input(ctx->in_block, ir->ninputs);
 	instr->regs[0]->num = regid++;
 	inputs[ir->ninputs++] = instr;
 	ctx->frag_pos->regs[2]->instr = instr;
@@ -2270,9 +2420,7 @@ fixup_frag_inputs(struct ir3_compile *ctx)
 
 int
 ir3_compile_shader_nir(struct ir3_compiler *compiler,
-		struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens,
-		struct ir3_shader_key key)
+		struct ir3_shader_variant *so)
 {
 	struct ir3_compile *ctx;
 	struct ir3 *ir;
@@ -2282,7 +2430,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	assert(!so->ir);
 
-	ctx = compile_init(compiler, so, tokens);
+	ctx = compile_init(compiler, so, so->shader->tokens);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
@@ -2307,7 +2455,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		fixup_frag_inputs(ctx);
 
 	/* at this point, for binning pass, throw away unneeded outputs: */
-	if (key.binning_pass) {
+	if (so->key.binning_pass) {
 		for (i = 0, j = 0; i < so->outputs_count; i++) {
 			unsigned name = sem2name(so->outputs[i].semantic);
 			unsigned idx = sem2idx(so->outputs[i].semantic);
@@ -2332,7 +2480,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	/* if we want half-precision outputs, mark the output registers
 	 * as half:
 	 */
-	if (key.half_precision) {
+	if (so->key.half_precision) {
 		for (i = 0; i < ir->noutputs; i++) {
 			struct ir3_instruction *out = ir->outputs[i];
 			if (!out)
@@ -2353,15 +2501,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		}
 	}
 
-	/* at this point, we want the kill's in the outputs array too,
-	 * so that they get scheduled (since they have no dst).. we've
-	 * already ensured that the array is big enough in push_block():
-	 */
-	if (so->type == SHADER_FRAGMENT) {
-		for (i = 0; i < ctx->kill_count; i++)
-			ir->outputs[ir->noutputs++] = ctx->kill[i];
-	}
-
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("BEFORE CP:\n");
 		ir3_print(ir);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 8c7c80f7aae..be4e4e81109 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -291,7 +291,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 			instr->regs[n+1] = src_reg;
 
 			if (src_reg->flags & IR3_REG_RELATIV)
-				instr->address = reg->instr->address;
+				ir3_instr_set_address(instr, reg->instr->address);
 
 			return;
 		}
@@ -300,7 +300,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
 				!conflicts(instr->address, reg->instr->address)) {
 			src_reg->flags = new_flags;
 			instr->regs[n+1] = src_reg;
-			instr->address = reg->instr->address;
+			ir3_instr_set_address(instr, reg->instr->address);
 
 			return;
 		}
@@ -389,7 +389,7 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags)
 	}
 
 	if (instr->address)
-		instr->address = instr_cp(instr->address, NULL);
+		ir3_instr_set_address(instr, instr_cp(instr->address, NULL));
 
 	return instr;
 }
@@ -408,6 +408,10 @@ ir3_cp(struct ir3 *ir)
 		}
 	}
 
+	for (unsigned i = 0; i < ir->keeps_count; i++) {
+		ir->keeps[i] = instr_cp(ir->keeps[i], NULL);
+	}
+
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		if (block->condition)
 			block->condition = instr_cp(block->condition, NULL);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 3a108243479..97df0c2ac99 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -156,6 +156,9 @@ ir3_depth(struct ir3 *ir)
 		if (ir->outputs[i])
 			ir3_instr_depth(ir->outputs[i]);
 
+	for (i = 0; i < ir->keeps_count; i++)
+		ir3_instr_depth(ir->keeps[i]);
+
 	/* We also need to account for if-condition: */
 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 		if (block->condition)
@@ -167,6 +170,15 @@ ir3_depth(struct ir3 *ir)
 		remove_unused_by_block(block);
 	}
 
+	/* note that we can end up with unused indirects, but we should
+	 * not end up with unused predicates.
+	 */
+	for (i = 0; i < ir->indirects_count; i++) {
+		struct ir3_instruction *instr = ir->indirects[i];
+		if (instr->depth == DEPTH_UNUSED)
+			ir->indirects[i] = NULL;
+	}
+
 	/* cleanup unused inputs: */
 	for (i = 0; i < ir->ninputs; i++) {
 		struct ir3_instruction *in = ir->inputs[i];
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index 70d9b08e019..ca28aefd502 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -236,6 +236,11 @@ find_neighbors(struct ir3 *ir)
 			instr_find_neighbors(instr);
 		}
 	}
+
+	for (i = 0; i < ir->keeps_count; i++) {
+		struct ir3_instruction *instr = ir->keeps[i];
+		instr_find_neighbors(instr);
+	}
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index f4a4223ae17..e94293f6d6b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -182,14 +182,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 			 */
 			ctx->has_samp = true;
 			regmask_set(&needs_sy, n->regs[0]);
-		} else if (is_mem(n)) {
+		} else if (is_load(n)) {
 			regmask_set(&needs_sy, n->regs[0]);
 		}
 
 		/* both tex/sfu appear to not always immediately consume
 		 * their src register(s):
 		 */
-		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
+		if (is_tex(n) || is_sfu(n) || is_load(n)) {
 			foreach_src(reg, n) {
 				if (reg_gpr(reg))
 					regmask_set(&needs_ss_war, reg);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index f377982dd5e..07e03d26908 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -175,6 +175,20 @@ print_instr(struct ir3_instruction *instr, int lvl)
 		printf("]");
 	}
 
+	if (instr->cp.left) {
+		printf(", left=_");
+		printf("[");
+		print_instr_name(instr->cp.left);
+		printf("]");
+	}
+
+	if (instr->cp.right) {
+		printf(", right=_");
+		printf("[");
+		print_instr_name(instr->cp.right);
+		printf("]");
+	}
+
 	if (is_meta(instr)) {
 		if (instr->opc == OPC_META_FO) {
 			printf(", off=%d", instr->fo.off);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index e5aba859fab..eaf3b3c35e8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -241,6 +241,21 @@ ir3_ra_alloc_reg_set(void *memctx)
 	return set;
 }
 
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+	BITSET_WORD *def;        /* variables defined before used in block */
+	BITSET_WORD *use;        /* variables used before defined in block */
+	BITSET_WORD *livein;     /* which defs reach entry point of block */
+	BITSET_WORD *liveout;    /* which defs reach exit point of block */
+};
+
+/* additional instruction-data (per-instruction) */
+struct ir3_ra_instr_data {
+	/* cached instruction 'definer' info: */
+	struct ir3_instruction *defn;
+	int off, sz, cls;
+};
+
 /* register-assign context, per-shader */
 struct ir3_ra_ctx {
 	struct ir3 *ir;
@@ -254,14 +269,7 @@ struct ir3_ra_ctx {
 	unsigned class_base[total_class_count];
 	unsigned instr_cnt;
 	unsigned *def, *use;     /* def/use table */
-};
-
-/* additional block-data (per-block) */
-struct ir3_ra_block_data {
-	BITSET_WORD *def;        /* variables defined before used in block */
-	BITSET_WORD *use;        /* variables used before defined in block */
-	BITSET_WORD *livein;     /* which defs reach entry point of block */
-	BITSET_WORD *liveout;    /* which defs reach exit point of block */
+	struct ir3_ra_instr_data *instrd;
 };
 
 static bool
@@ -291,8 +299,6 @@ is_temp(struct ir3_register *reg)
 {
 	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 		return false;
-	if (reg->flags & IR3_REG_RELATIV) // TODO
-		return false;
 	if ((reg->num == regid(REG_A0, 0)) ||
 			(reg->num == regid(REG_P0, 0)))
 		return false;
@@ -309,28 +315,45 @@ writes_gpr(struct ir3_instruction *instr)
 }
 
 static struct ir3_instruction *
-get_definer(struct ir3_instruction *instr, int *sz, int *off)
+get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
+		int *sz, int *off)
 {
+	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 	struct ir3_instruction *d = NULL;
+
+	if (instr->fanin)
+		return get_definer(ctx, instr->fanin, sz, off);
+
+	if (id->defn) {
+		*sz = id->sz;
+		*off = id->off;
+		return id->defn;
+	}
+
 	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
 		/* What about the case where collect is subset of array, we
 		 * need to find the distance between where actual array starts
 		 * and fanin..  that probably doesn't happen currently.
 		 */
 		struct ir3_register *src;
+		int dsz, doff;
 
 		/* note: don't use foreach_ssa_src as this gets called once
 		 * while assigning regs (which clears SSA flag)
 		 */
-		foreach_src(src, instr) {
+		foreach_src_n(src, n, instr) {
+			struct ir3_instruction *dd;
 			if (!src->instr)
 				continue;
-			if ((!d) || (src->instr->ip < d->ip))
-				d = src->instr;
-		}
 
-		*sz = instr->regs_count - 1;
-		*off = 0;
+			dd = get_definer(ctx, src->instr, &dsz, &doff);
+
+			if ((!d) || (dd->ip < d->ip)) {
+				d = dd;
+				*sz = dsz;
+				*off = doff - n;
+			}
+		}
 
 	} else if (instr->cp.right || instr->cp.left) {
 		/* covers also the meta:fo case, which ends up w/ single
@@ -386,7 +409,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		struct ir3_instruction *dd;
 		int dsz, doff;
 
-		dd = get_definer(phi, &dsz, &doff);
+		dd = get_definer(ctx, phi, &dsz, &doff);
 
 		*sz = MAX2(*sz, dsz);
 		*off = doff;
@@ -401,6 +424,7 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		 * the phi, so we don't need to chase definers
 		 */
 		struct ir3_register *src;
+		struct ir3_instruction *dd = d;
 
 		/* note: don't use foreach_ssa_src as this gets called once
 		 * while assigning regs (which clears SSA flag)
@@ -408,16 +432,18 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		foreach_src(src, d) {
 			if (!src->instr)
 				continue;
-			if (src->instr->ip < d->ip)
-				d = src->instr;
+			if (src->instr->ip < dd->ip)
+				dd = src->instr;
 		}
+
+		d = dd;
 	}
 
 	if (is_meta(d) && (d->opc == OPC_META_FO)) {
 		struct ir3_instruction *dd;
 		int dsz, doff;
 
-		dd = get_definer(d->regs[1]->instr, &dsz, &doff);
+		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
 
 		/* by definition, should come before: */
 		debug_assert(dd->ip < d->ip);
@@ -429,9 +455,30 @@ get_definer(struct ir3_instruction *instr, int *sz, int *off)
 		d = dd;
 	}
 
+	id->defn = d;
+	id->sz = *sz;
+	id->off = *off;
+
 	return d;
 }
 
+static void
+ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+		if (instr->regs_count == 0)
+			continue;
+		/* couple special cases: */
+		if (writes_addr(instr) || writes_pred(instr)) {
+			id->cls = -1;
+			continue;
+		}
+		id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+		id->cls = size_to_class(id->sz, is_half(id->defn));
+	}
+}
+
 /* give each instruction a name (and ip), and count up the # of names
  * of each class
  */
@@ -439,8 +486,11 @@ static void
 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-		struct ir3_instruction *defn;
-		int cls, sz, off;
+		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+
+#ifdef DEBUG
+		instr->name = ~0;
+#endif
 
 		ctx->instr_cnt++;
 
@@ -450,9 +500,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (!writes_gpr(instr))
 			continue;
 
-		defn = get_definer(instr, &sz, &off);
-
-		if (defn != instr)
+		if (id->defn != instr)
 			continue;
 
 		/* arrays which don't fit in one of the pre-defined class
@@ -460,9 +508,8 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		 *
 		 * TODO but we still need to allocate names for them, don't we??
 		 */
-		cls = size_to_class(sz, is_half(defn));
-		if (cls >= 0) {
-			instr->name = ctx->class_alloc_count[cls]++;
+		if (id->cls >= 0) {
+			instr->name = ctx->class_alloc_count[id->cls]++;
 			ctx->alloc_count++;
 		}
 	}
@@ -471,8 +518,16 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 static void
 ra_init(struct ir3_ra_ctx *ctx)
 {
+	unsigned n;
+
 	ir3_clear_mark(ctx->ir);
-	ir3_count_instructions(ctx->ir);
+	n = ir3_count_instructions(ctx->ir);
+
+	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		ra_block_find_definers(ctx, block);
+	}
 
 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
 		ra_block_name_instructions(ctx, block);
@@ -488,6 +543,7 @@ ra_init(struct ir3_ra_ctx *ctx)
 	}
 
 	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+	ralloc_steal(ctx->g, ctx->instrd);
 	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 }
@@ -555,39 +611,36 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		 */
 
 		if (writes_gpr(instr)) {
-			struct ir3_instruction *defn;
-			int cls, sz, off;
+			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-			defn = get_definer(instr, &sz, &off);
-			if (defn == instr) {
+			if (id->defn == instr) {
 				/* arrays which don't fit in one of the pre-defined class
 				 * sizes are pre-colored:
 				 */
-				cls = size_to_class(sz, is_half(defn));
-				if (cls >= 0) {
-					unsigned name = ra_name(ctx, cls, defn);
+				if (id->cls >= 0) {
+					unsigned name = ra_name(ctx, id->cls, id->defn);
 
-					ctx->def[name] = defn->ip;
-					ctx->use[name] = defn->ip;
+					ctx->def[name] = id->defn->ip;
+					ctx->use[name] = id->defn->ip;
 
 					/* since we are in SSA at this point: */
 					debug_assert(!BITSET_TEST(bd->use, name));
 
 					BITSET_SET(bd->def, name);
 
-					if (is_half(defn)) {
+					if (is_half(id->defn)) {
 						ra_set_node_class(ctx->g, name,
-								ctx->set->half_classes[cls - class_count]);
+								ctx->set->half_classes[id->cls - class_count]);
 					} else {
 						ra_set_node_class(ctx->g, name,
-								ctx->set->classes[cls]);
+								ctx->set->classes[id->cls]);
 					}
 
 					/* extend the live range for phi srcs, which may come
 					 * from the bottom of the loop
 					 */
-					if (defn->regs[0]->flags & IR3_REG_PHI_SRC) {
-						struct ir3_instruction *phi = defn->regs[0]->instr;
+					if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+						struct ir3_instruction *phi = id->defn->regs[0]->instr;
 						foreach_ssa_src(src, phi) {
 							/* if src is after phi, then we need to extend
 							 * the liverange to the end of src's block:
@@ -606,13 +659,10 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 
 		foreach_ssa_src(src, instr) {
 			if (writes_gpr(src)) {
-				struct ir3_instruction *srcdefn;
-				int cls, sz, off;
+				struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];
 
-				srcdefn = get_definer(src, &sz, &off);
-				cls = size_to_class(sz, is_half(srcdefn));
-				if (cls >= 0) {
-					unsigned name = ra_name(ctx, cls, srcdefn);
+				if (id->cls >= 0) {
+					unsigned name = ra_name(ctx, id->cls, id->defn);
 					ctx->use[name] = MAX2(ctx->use[name], instr->ip);
 					if (!BITSET_TEST(bd->def, name))
 						BITSET_SET(bd->use, name);
@@ -704,13 +754,10 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
 	/* need to fix things up to keep outputs live: */
 	for (unsigned i = 0; i < ir->noutputs; i++) {
 		struct ir3_instruction *instr = ir->outputs[i];
-		struct ir3_instruction *defn;
-		int cls, sz, off;
+		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-		defn = get_definer(instr, &sz, &off);
-		cls = size_to_class(sz, is_half(defn));
-		if (cls >= 0) {
-			unsigned name = ra_name(ctx, cls, defn);
+		if (id->cls >= 0) {
+			unsigned name = ra_name(ctx, id->cls, id->defn);
 			ctx->use[name] = ctx->instr_cnt;
 		}
 	}
@@ -780,15 +827,12 @@ static void
 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *defn;
-	int cls, sz, off;
+	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-	defn = get_definer(instr, &sz, &off);
-	cls = size_to_class(sz, is_half(defn));
-	if (cls >= 0) {
-		unsigned name = ra_name(ctx, cls, defn);
+	if (id->cls >= 0) {
+		unsigned name = ra_name(ctx, id->cls, id->defn);
 		unsigned r = ra_get_node_reg(ctx->g, name);
-		unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
+		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 
 		if (reg->flags & IR3_REG_RELATIV)
 			num += reg->offset;
@@ -796,7 +840,7 @@ reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 		reg->num = num;
 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
 
-		if (is_half(defn))
+		if (is_half(id->defn))
 			reg->flags |= IR3_REG_HALF;
 	}
 }
@@ -851,19 +895,16 @@ ra_alloc(struct ir3_ra_ctx *ctx)
 		for (j = 0; i < ir->ninputs; i++) {
 			struct ir3_instruction *instr = ir->inputs[i];
 			if (instr) {
-				struct ir3_instruction *defn;
-				int cls, sz, off;
+				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 
-				defn = get_definer(instr, &sz, &off);
-				if (defn == instr) {
+				if (id->defn == instr) {
 					unsigned name, reg;
 
-					cls = size_to_class(sz, is_half(defn));
-					name = ra_name(ctx, cls, defn);
-					reg = ctx->set->gpr_to_ra_reg[cls][j];
+					name = ra_name(ctx, id->cls, id->defn);
+					reg = ctx->set->gpr_to_ra_reg[id->cls][j];
 
 					ra_set_node_reg(ctx->g, name, reg);
-					j += sz;
+					j += id->sz;
 				}
 			}
 		}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 49a4426d163..2ee325518f7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -80,12 +80,12 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 	list_delinit(&instr->node);
 
 	if (writes_addr(instr)) {
-		assert(ctx->addr == NULL);
+		debug_assert(ctx->addr == NULL);
 		ctx->addr = instr;
 	}
 
 	if (writes_pred(instr)) {
-		assert(ctx->pred == NULL);
+		debug_assert(ctx->pred == NULL);
 		ctx->pred = instr;
 	}
 
@@ -180,13 +180,13 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	 * free:
 	 */
 	if (writes_addr(instr) && ctx->addr) {
-		assert(ctx->addr != instr);
+		debug_assert(ctx->addr != instr);
 		notes->addr_conflict = true;
 		return true;
 	}
 
 	if (writes_pred(instr) && ctx->pred) {
-		assert(ctx->pred != instr);
+		debug_assert(ctx->pred != instr);
 		notes->pred_conflict = true;
 		return true;
 	}
@@ -261,6 +261,20 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 	return 0;
 }
 
+/* could an instruction be scheduled if specified ssa src was scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+	struct ir3_instruction *other_src;
+	foreach_ssa_src(other_src, instr) {
+		/* if dependency not scheduled, we aren't ready yet: */
+		if ((src != other_src) && !is_scheduled(other_src)) {
+			return false;
+		}
+	}
+	return true;
+}
+
 /* move eligible instructions to the priority list: */
 static unsigned
 add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
@@ -272,6 +286,31 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		int e = instr_eligibility(ctx, notes, instr);
 		if (e < 0)
 			continue;
+
+		/* For instructions that write address register we need to
+		 * make sure there is at least one instruction that uses the
+		 * addr value which is otherwise ready.
+		 *
+		 * TODO if any instructions use pred register and have other
+		 * src args, we would need to do the same for writes_pred()..
+		 */
+		if (unlikely(writes_addr(instr))) {
+			struct ir3 *ir = instr->block->shader;
+			bool ready = false;
+			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+				struct ir3_instruction *indirect = ir->indirects[i];
+				if (!indirect)
+					continue;
+				if (indirect->address != instr)
+					continue;
+				ready = could_sched(indirect, instr);
+			}
+
+			/* nothing could be scheduled, so keep looking: */
+			if (!ready)
+				continue;
+		}
+
 		min_delay = MIN2(min_delay, e);
 		if (e == 0) {
 			/* remove from unscheduled list and into priority queue: */
@@ -287,20 +326,25 @@ add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
  * instructions which depend on the current address register
  * to a clone of the instruction which wrote the address reg.
  */
-static void
+static struct ir3_instruction *
 split_addr(struct ir3_sched_ctx *ctx)
 {
-	struct ir3 *ir = ctx->addr->block->shader;
+	struct ir3 *ir;
 	struct ir3_instruction *new_addr = NULL;
 	unsigned i;
 
 	debug_assert(ctx->addr);
 
+	ir = ctx->addr->block->shader;
+
 	for (i = 0; i < ir->indirects_count; i++) {
 		struct ir3_instruction *indirect = ir->indirects[i];
 
+		if (!indirect)
+			continue;
+
 		/* skip instructions already scheduled: */
-		if (indirect->flags & IR3_INSTR_MARK)
+		if (is_scheduled(indirect))
 			continue;
 
 		/* remap remaining instructions using current addr
@@ -312,32 +356,36 @@ split_addr(struct ir3_sched_ctx *ctx)
 				/* original addr is scheduled, but new one isn't: */
 				new_addr->flags &= ~IR3_INSTR_MARK;
 			}
-			indirect->address = new_addr;
+			ir3_instr_set_address(indirect, new_addr);
 		}
 	}
 
 	/* all remaining indirects remapped to new addr: */
 	ctx->addr = NULL;
+
+	return new_addr;
 }
 
 /* "spill" the predicate register by remapping any unscheduled
  * instructions which depend on the current predicate register
  * to a clone of the instruction which wrote the address reg.
  */
-static void
+static struct ir3_instruction *
 split_pred(struct ir3_sched_ctx *ctx)
 {
-	struct ir3 *ir = ctx->pred->block->shader;
+	struct ir3 *ir;
 	struct ir3_instruction *new_pred = NULL;
 	unsigned i;
 
 	debug_assert(ctx->pred);
 
+	ir = ctx->pred->block->shader;
+
 	for (i = 0; i < ir->predicates_count; i++) {
 		struct ir3_instruction *predicated = ir->predicates[i];
 
 		/* skip instructions already scheduled: */
-		if (predicated->flags & IR3_INSTR_MARK)
+		if (is_scheduled(predicated))
 			continue;
 
 		/* remap remaining instructions using current pred
@@ -358,6 +406,8 @@ split_pred(struct ir3_sched_ctx *ctx)
 
 	/* all remaining predicated remapped to new pred: */
 	ctx->pred = NULL;
+
+	return new_pred;
 }
 
 static void
@@ -407,20 +457,32 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
 			schedule(ctx, instr);
 		} else if (delay == ~0) {
+			struct ir3_instruction *new_instr = NULL;
+
 			/* nothing available to schedule.. if we are blocked on
 			 * address/predicate register conflict, then break the
 			 * deadlock by cloning the instruction that wrote that
 			 * reg:
 			 */
 			if (notes.addr_conflict) {
-				split_addr(ctx);
+				new_instr = split_addr(ctx);
 			} else if (notes.pred_conflict) {
-				split_pred(ctx);
+				new_instr = split_pred(ctx);
 			} else {
 				debug_assert(0);
 				ctx->error = true;
 				return;
 			}
+
+			if (new_instr) {
+				list_del(&new_instr->node);
+				list_addtail(&new_instr->node, &unscheduled_list);
+				/* the original instr that wrote addr/pred may have
+				 * originated from a different block:
+				 */
+				new_instr->block = block;
+			}
+
 		} else {
 			/* and if we run out of instructions that can be scheduled,
 			 * then it is time for nop's:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index b5b038100cc..312174c0c6d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -46,7 +46,8 @@ delete_variant(struct ir3_shader_variant *v)
 {
 	if (v->ir)
 		ir3_destroy(v->ir);
-	fd_bo_del(v->bo);
+	if (v->bo)
+		fd_bo_del(v->bo);
 	free(v);
 }
 
@@ -139,6 +140,32 @@ assemble_variant(struct ir3_shader_variant *v)
 
 	memcpy(fd_bo_map(v->bo), bin, sz);
 
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		struct ir3_shader_key key = v->key;
+		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+			key.binning_pass, key.color_two_side, key.half_precision);
+		ir3_shader_disasm(v, bin);
+	}
+
+	if (fd_mesa_debug & FD_DBG_SHADERDB) {
+		/* print generic shader info: */
+		fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n",
+				ir3_shader_stage(v->shader),
+				v->shader->id, v->id,
+				v->info.instrs_count,
+				v->info.sizedwords);
+		fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u half, %u full\n",
+				ir3_shader_stage(v->shader),
+				v->shader->id, v->id,
+				v->info.max_half_reg + 1,
+				v->info.max_reg + 1);
+		fprintf(stderr, "SHADER-DB: %s prog %d/%d: %u const, %u constlen\n",
+				ir3_shader_stage(v->shader),
+				v->shader->id, v->id,
+				v->info.max_const + 1,
+				v->constlen);
+	}
+
 	free(bin);
 
 	/* no need to keep the ir around beyond this point: */
@@ -150,12 +177,12 @@ static struct ir3_shader_variant *
 create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 {
 	struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
-	const struct tgsi_token *tokens = shader->tokens;
 	int ret;
 
 	if (!v)
 		return NULL;
 
+	v->id = ++shader->variant_count;
 	v->shader = shader;
 	v->key = key;
 	v->type = shader->type;
@@ -163,10 +190,10 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 	if (fd_mesa_debug & FD_DBG_DISASM) {
 		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
 			key.binning_pass, key.color_two_side, key.half_precision);
-		tgsi_dump(tokens, 0);
+		tgsi_dump(shader->tokens, 0);
 	}
 
-	ret = ir3_compile_shader_nir(shader->compiler, v, tokens, key);
+	ret = ir3_compile_shader_nir(shader->compiler, v);
 	if (ret) {
 		debug_error("compile failed!");
 		goto fail;
@@ -178,12 +205,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 		goto fail;
 	}
 
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
-			key.binning_pass, key.color_two_side, key.half_precision);
-		disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
-	}
-
 	return v;
 
 fail:
@@ -228,8 +249,10 @@ ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 
 	/* compile new variant if it doesn't exist already: */
 	v = create_variant(shader, key);
-	v->next = shader->variants;
-	shader->variants = v;
+	if (v) {
+		v->next = shader->variants;
+		shader->variants = v;
+	}
 
 	return v;
 }
@@ -249,13 +272,372 @@ ir3_shader_destroy(struct ir3_shader *shader)
 }
 
 struct ir3_shader *
-ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
+ir3_shader_create(struct pipe_context *pctx,
+		const struct pipe_shader_state *cso,
 		enum shader_t type)
 {
 	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+	if (!shader)
+		return NULL;   /* allocation failed; same policy as create_variant() */
 	shader->compiler = fd_context(pctx)->screen->compiler;
+	shader->id = ++shader->compiler->shader_count;
 	shader->pctx = pctx;
 	shader->type = type;
-	shader->tokens = tgsi_dup_tokens(tokens);
+	shader->tokens = tgsi_dup_tokens(cso->tokens);
+	shader->stream_output = cso->stream_output;
+	if (fd_mesa_debug & FD_DBG_SHADERDB) {
+		/* shader-db run: create a standard variant immediately, as
+		 * otherwise nothing will trigger the actual compile: */
+		static struct ir3_shader_key key = {};
+		ir3_shader_variant(shader, key);
+	}
 	return shader;
 }
+
+static void dump_reg(const char *name, uint32_t r)   /* prints "; <name>: rN.c" */
+{
+	if (r != regid(63,0))   /* regid(63,0) is not printed -- presumably the "unassigned" marker, confirm */
+		debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);   /* N = r >> 2, c = component r & 3 */
+}
+
+/* look up the output register for semantic 'sem' (index 0) and print it: */
+static void dump_semantic(struct ir3_shader_variant *so,
+		unsigned sem, const char *name)
+{
+	const ir3_semantic key = ir3_semantic_name(sem, 0);
+	dump_reg(name, ir3_find_output_regid(so, key));
+}
+
+void
+ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
+{
+	struct ir3 *ir = so->ir;
+	struct ir3_register *reg;
+	const char *type = ir3_shader_stage(so->shader);
+	uint8_t regid;
+	unsigned i;
+	/* debug dump of the variant; first the ir inputs, as "@in(..)" lines: */
+	for (i = 0; i < ir->ninputs; i++) {
+		if (!ir->inputs[i]) {
+			debug_printf("; in%d unused\n", i);
+			continue;
+		}
+		reg = ir->inputs[i]->regs[0];
+		regid = reg->num;
+		debug_printf("@in(%sr%d.%c)\tin%d\n",
+				(reg->flags & IR3_REG_HALF) ? "h" : "",
+				(regid >> 2), "xyzw"[regid & 0x3], i);
+	}
+	/* then the ir outputs, as "@out(..)" lines: */
+	for (i = 0; i < ir->noutputs; i++) {
+		if (!ir->outputs[i]) {
+			debug_printf("; out%d unused\n", i);
+			continue;
+		}
+		/* kill shows up as a virtual output.. skip it! */
+		if (is_kill(ir->outputs[i]))
+			continue;
+		reg = ir->outputs[i]->regs[0];
+		regid = reg->num;
+		debug_printf("@out(%sr%d.%c)\tout%d\n",
+				(reg->flags & IR3_REG_HALF) ? "h" : "",
+				(regid >> 2), "xyzw"[regid & 0x3], i);
+	}
+	/* then the immediates, four 32-bit values per "@const(..)" line: */
+	for (i = 0; i < so->immediates_count; i++) {
+		debug_printf("@const(c%d.x)\t", so->first_immediate + i);
+		debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
+				so->immediates[i].val[0],
+				so->immediates[i].val[1],
+				so->immediates[i].val[2],
+				so->immediates[i].val[3]);
+	}
+
+	disasm_a3xx(bin, so->info.sizedwords, 0, so->type);
+	/* followed by the variant's output/input register assignments: */
+	debug_printf("; %s: outputs:", type);
+	for (i = 0; i < so->outputs_count; i++) {
+		uint8_t regid = so->outputs[i].regid;   /* NOTE: shadows the function-scope regid */
+		ir3_semantic sem = so->outputs[i].semantic;
+		debug_printf(" r%d.%c (%u:%u)",
+				(regid >> 2), "xyzw"[regid & 0x3],
+				sem2name(sem), sem2idx(sem));
+	}
+	debug_printf("\n");
+	debug_printf("; %s: inputs:", type);
+	for (i = 0; i < so->inputs_count; i++) {
+		uint8_t regid = so->inputs[i].regid;   /* NOTE: shadows the function-scope regid */
+		ir3_semantic sem = so->inputs[i].semantic;
+		debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)",
+				(regid >> 2), "xyzw"[regid & 0x3],
+				sem2name(sem), sem2idx(sem),
+				so->inputs[i].compmask,
+				so->inputs[i].inloc,
+				so->inputs[i].bary);
+	}
+	debug_printf("\n");
+
+	/* print generic shader info: */
+	debug_printf("; %s prog %d/%d: %u instructions, %d half, %d full\n",
+			type, so->shader->id, so->id,
+			so->info.instrs_count,
+			so->info.max_half_reg + 1,
+			so->info.max_reg + 1);
+
+	debug_printf("; %d const, %u constlen\n",
+			so->info.max_const + 1,
+			so->constlen);
+
+	/* print shader type specific info: */
+	switch (so->type) {
+	case SHADER_VERTEX:
+		dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos");
+		dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize");
+		break;
+	case SHADER_FRAGMENT:
+		dump_reg("pos (bary)", so->pos_regid);
+		dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz");
+		dump_semantic(so, TGSI_SEMANTIC_COLOR, "color");
+		/* these two are hard-coded since we don't know how to
+		 * program them to anything but all 0's...
+		 */
+		if (so->frag_coord)
+			debug_printf("; fragcoord: r0.x\n");
+		if (so->frag_face)
+			debug_printf("; fragface: hr0.x\n");
+		break;
+	case SHADER_COMPUTE:
+		break;
+	}
+
+	debug_printf("\n");
+}
+
+/* This has to reach into the fd_context a bit more than the rest of
+ * ir3, but it needs to be aligned with the compiler, so both agree
+ * on which const regs hold what.  And the logic is identical between
+ * a3xx/a4xx, the only difference is small details in the actual
+ * CP_LOAD_STATE packets (which are handled inside the generation-
+ * specific ctx->emit_const(_bo)() functions).
+ */
+
+#include "freedreno_resource.h"
+
+static void
+emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_constbuf_stateobj *constbuf)
+{
+	const unsigned slot = 0;      /* user consts live in constbuf slot 0 */
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	struct pipe_constant_buffer *cb = &constbuf->cb[slot];
+	uint32_t limit;
+	unsigned dwords;
+
+	/* TODO save/restore dirty_mask for binning pass instead of keying
+	 * off of enabled_mask:
+	 */
+	if (!(constbuf->enabled_mask & (1 << slot)))
+		return;
+
+	dwords = align(cb->buffer_size, 4) / 4;
+
+	/* the upload is expected to be a whole number of vec4's: */
+	assert(dwords == align(dwords, 4));
+
+	/* A binning shader may leave consts unused, so constlen can be
+	 * smaller than first_driver_param.  Clamp to whichever is lower
+	 * (converted from vec4 to dwords) so that we never write more
+	 * consts than the shader uses, which can lock up the HLSQ:
+	 */
+	limit = 4 * MIN2(v->first_driver_param, v->constlen);
+	dwords = MIN2(dwords, limit);
+
+	if (dwords == 0)
+		return;
+
+	fd_wfi(ctx, ring);
+	ctx->emit_const(ring, v->type, 0,
+			cb->buffer_offset, dwords,
+			cb->user_buffer, cb->buffer);
+	constbuf->dirty_mask &= ~(1 << slot);
+}
+
+static void
+emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_constbuf_stateobj *constbuf)
+{
+	/* UBO pointers are placed right after the user consts: */
+	const uint32_t base = v->first_driver_param;
+
+	if (v->constlen <= base)
+		return;
+
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	uint32_t count = MIN2(4, v->constlen - base) * 4;
+	uint32_t offsets[count];
+	struct fd_bo *bos[count];
+
+	for (uint32_t n = 0; n < count; n++) {
+		/* constbuf slot 0 holds the user consts, so UBO n is slot n+1: */
+		const uint32_t slot = n + 1;
+		struct pipe_constant_buffer *cb = &constbuf->cb[slot];
+
+		assert(!cb->user_buffer);
+		bool bound = (constbuf->enabled_mask & (1 << slot)) && cb->buffer;
+		offsets[n] = bound ? cb->buffer_offset : 0;
+		bos[n] = bound ? fd_resource(cb->buffer)->bo : NULL;
+	}
+
+	fd_wfi(ctx, ring);
+	ctx->emit_const_bo(ring, v->type, false, base * 4, count, bos, offsets);
+}
+
+static void
+emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	uint32_t first = v->first_immediate;
+	int count = v->immediates_count;
+
+	/* clamp so that we stop at constlen, ie. don't upload immediates
+	 * which the shader does not use:
+	 */
+	count = MIN2(count + first, v->constlen) - first;
+
+	/* convert both out of vec4 units: */
+	first *= 4;
+	count *= 4;
+
+	if (count <= 0)
+		return;
+	fd_wfi(ctx, ring);
+	ctx->emit_const(ring, v->type, first,
+		0, count, v->immediates[0].val, NULL);
+}
+
+/* emit stream-out buffer addresses: */
+static void
+emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+	/* streamout addresses are placed after the driver-params: */
+	const uint32_t base = v->first_driver_param + 5;
+	if (v->constlen <= base)
+		return;
+
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	struct fd_streamout_stateobj *so = &ctx->streamout;
+	struct pipe_stream_output_info *info = &v->shader->stream_output;
+	uint32_t count = 4;
+	uint32_t offsets[count];
+	struct fd_bo *bos[count];
+
+	for (uint32_t n = 0; n < count; n++) {
+		struct pipe_stream_output_target *target = so->targets[n];
+		offsets[n] = 0;
+		bos[n] = NULL;
+		if (target) {
+			offsets[n] = (so->offsets[n] * info->stride[n] * 4) +
+					target->buffer_offset;
+			bos[n] = fd_resource(target->buffer)->bo;
+		}
+	}
+
+	fd_wfi(ctx, ring);
+	ctx->emit_const_bo(ring, v->type, true, base * 4, count, bos, offsets);
+}
+
+/* vertex-count limit for stream-out (0 on binning pass / nothing to do): */
+static uint32_t
+max_tf_vtx(struct ir3_shader_variant *v)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+	struct fd_streamout_stateobj *so = &ctx->streamout;
+	struct pipe_stream_output_info *info = &v->shader->stream_output;
+	uint32_t maxvtxcnt = 0x7fffffff;
+
+	if (v->key.binning_pass || !info->num_outputs || !so->num_targets)
+		return 0;
+
+	/* offset to write to is:
+	 *
+	 *   total_vtxcnt = vtxcnt + offsets[i]
+	 *   offset = total_vtxcnt * stride[i]
+	 *
+	 *   offset =   vtxcnt * stride[i]       ; calculated in shader
+	 *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
+	 *
+	 * assuming for each vtx, each target buffer will have data written
+	 * up to 'offset + stride[i]', that leaves maxvtxcnt as:
+	 *
+	 *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
+	 *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
+	 *
+	 * but shader is actually doing a less-than (rather than less-than-
+	 * equal) check, so we can drop the -stride[i].
+	 *
+	 * TODO is assumption about `offset + stride[i]` legit?
+	 */
+	for (unsigned i = 0; i < so->num_targets; i++) {
+		struct pipe_stream_output_target *target = so->targets[i];
+		unsigned stride = info->stride[i] * 4;   /* convert dwords->bytes */
+		/* a bound target with zero stride receives no data from this
+		 * shader, so it imposes no limit (and must not be divided by):
+		 */
+		if (target && stride) {
+			uint32_t max = target->buffer_size / stride;
+			maxvtxcnt = MIN2(maxvtxcnt, max);
+		}
+	}
+
+	return maxvtxcnt;
+}
+
+/* emit user consts, UBOs and immediates if dirty, plus (VS only, when
+ * draw 'info' is given) driver params and stream-out buffer addresses:
+ */
+void
+ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		const struct pipe_draw_info *info, uint32_t dirty)
+{
+	struct fd_context *ctx = fd_context(v->shader->pctx);
+
+	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) {
+		struct fd_constbuf_stateobj *constbuf;
+		bool shader_dirty;
+
+		if (v->type == SHADER_VERTEX) {
+			constbuf = &ctx->constbuf[PIPE_SHADER_VERTEX];
+			shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_VP);
+		} else if (v->type == SHADER_FRAGMENT) {
+			constbuf = &ctx->constbuf[PIPE_SHADER_FRAGMENT];
+			shader_dirty = !!(ctx->prog.dirty & FD_SHADER_DIRTY_FP);
+		} else {
+			unreachable("bad shader type");
+			return;
+		}
+
+		emit_user_consts(v, ring, constbuf);
+		emit_ubos(v, ring, constbuf);
+		if (shader_dirty)
+			emit_immediates(v, ring);
+	}
+
+	/* emit driver params every time: */
+	/* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
+	if (info && (v->type == SHADER_VERTEX)) {
+		uint32_t offset = v->first_driver_param + 4;  /* driver params after UBOs */
+		/* the driver-param vec4 sits at 'offset', so constlen must exceed it: */
+		if (v->constlen > offset) {
+			uint32_t vertex_params[4] = {
+				[IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start,
+				[IR3_DP_VTXCNT_MAX] = max_tf_vtx(v),
+			};
+			fd_wfi(ctx, ring);
+			ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
+					ARRAY_SIZE(vertex_params), vertex_params, NULL);
+			/* if needed, emit stream-out buffer addresses: */
+			if (vertex_params[IR3_DP_VTXCNT_MAX] > 0)
+				emit_tfbos(v, ring);
+		}
+	}
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 9f1b0769180..1bbbdbd224d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -29,9 +29,22 @@
 #ifndef IR3_SHADER_H_
 #define IR3_SHADER_H_
 
+#include "pipe/p_state.h"
+
 #include "ir3.h"
 #include "disasm.h"
 
+/* driver param indices (slots of the vertex_params[] vec4 in ir3_emit_consts()): */
+enum ir3_driver_param {
+	IR3_DP_VTXID_BASE = 0,   /* index_bias for indexed draws, else start */
+	IR3_DP_VTXCNT_MAX = 1,   /* stream-out vertex limit, from max_tf_vtx() */
+};
+
+/* internal semantic used for passing vtxcnt to vertex shader to
+ * implement transform feedback:
+ */
+#define IR3_SEMANTIC_VTXCNT (TGSI_SEMANTIC_COUNT + 0)
+
 typedef uint16_t ir3_semantic;  /* semantic name + index */
 static inline ir3_semantic
 ir3_semantic_name(uint8_t name, uint16_t index)
@@ -100,6 +113,9 @@ ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b)
 struct ir3_shader_variant {
 	struct fd_bo *bo;
 
+	/* variant id (for debug) */
+	uint32_t id;
+
 	struct ir3_shader_key key;
 
 	struct ir3_info info;
@@ -192,26 +208,44 @@ struct ir3_shader_variant {
 struct ir3_shader {
 	enum shader_t type;
 
+	/* shader id (for debug): */
+	uint32_t id;
+	uint32_t variant_count;
+
 	struct ir3_compiler *compiler;
 
 	struct pipe_context *pctx;
 	const struct tgsi_token *tokens;
+	struct pipe_stream_output_info stream_output;
 
 	struct ir3_shader_variant *variants;
-
-	/* so far, only used for blit_prog shader.. values for
-	 * VPC_VARYING_PS_REPL[i].MODE
-	 */
-	uint32_t vpsrepl[8];
 };
 
 void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
 
 struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
-		const struct tgsi_token *tokens, enum shader_t type);
+		const struct pipe_shader_state *cso, enum shader_t type);
 void ir3_shader_destroy(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key);
+void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin);
+
+struct fd_ringbuffer;
+void ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		const struct pipe_draw_info *info, uint32_t dirty);
+
+static inline const char *
+ir3_shader_stage(struct ir3_shader *shader)
+{
+	const char *stage = NULL;   /* short stage tag used in debug output */
+	switch (shader->type) {
+	case SHADER_VERTEX:     stage = "VERT"; break;
+	case SHADER_FRAGMENT:   stage = "FRAG"; break;
+	case SHADER_COMPUTE:    stage = "CL";   break;
+	default:                unreachable("invalid type");
+	}
+	return stage;
+}
 
 /*
  * Helper/util:
diff --git a/src/gallium/drivers/i915/i915_batchbuffer.h b/src/gallium/drivers/i915/i915_batchbuffer.h
index dcf63543219..6466fa594f9 100644
--- a/src/gallium/drivers/i915/i915_batchbuffer.h
+++ b/src/gallium/drivers/i915/i915_batchbuffer.h
@@ -33,20 +33,20 @@
 
 struct i915_context;
 
-static INLINE size_t
+static inline size_t
 i915_winsys_batchbuffer_space(struct i915_winsys_batchbuffer *batch)
 {
    return batch->size - (batch->ptr - batch->map);
 }
 
-static INLINE boolean
+static inline boolean
 i915_winsys_batchbuffer_check(struct i915_winsys_batchbuffer *batch,
                               size_t dwords)
 {
    return dwords * 4 <= i915_winsys_batchbuffer_space(batch);
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch,
                                         unsigned dword)
 {
@@ -54,7 +54,7 @@ i915_winsys_batchbuffer_dword_unchecked(struct i915_winsys_batchbuffer *batch,
    batch->ptr += 4;
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch,
                               float f)
 {
@@ -64,7 +64,7 @@ i915_winsys_batchbuffer_float(struct i915_winsys_batchbuffer *batch,
    i915_winsys_batchbuffer_dword_unchecked(batch, uif.ui);
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch,
                               unsigned dword)
 {
@@ -72,7 +72,7 @@ i915_winsys_batchbuffer_dword(struct i915_winsys_batchbuffer *batch,
    i915_winsys_batchbuffer_dword_unchecked(batch, dword);
 }
 
-static INLINE void
+static inline void
 i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch,
                               void *data,
                               size_t size)
@@ -83,7 +83,7 @@ i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch,
    batch->ptr += size;
 }
 
-static INLINE boolean
+static inline boolean
 i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch,
                              struct i915_winsys_buffer **buffers,
                              int num_of_buffers)
@@ -91,7 +91,7 @@ i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch,
    return batch->iws->validate_buffers(batch, buffers, num_of_buffers);
 }
 
-static INLINE int
+static inline int
 i915_winsys_batchbuffer_reloc(struct i915_winsys_batchbuffer *batch,
                               struct i915_winsys_buffer *buffer,
                               enum i915_winsys_buffer_usage usage,
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 40abf3c577f..c8c7d64f5cb 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -339,7 +339,7 @@ struct i915_context {
 #define I915_DST_VARS                   4
 #define I915_DST_RECT                   8
 
-static INLINE
+static inline
 void i915_set_flush_dirty(struct i915_context *i915, unsigned flush)
 {
    i915->hardware_dirty |= I915_HW_FLUSH;
@@ -408,7 +408,7 @@ struct pipe_context *i915_create_context(struct pipe_screen *screen,
  * Inline conversion functions.  These are better-typed than the
  * macros used previously:
  */
-static INLINE struct i915_context *
+static inline struct i915_context *
 i915_context( struct pipe_context *pipe )
 {
    return (struct i915_context *)pipe;
diff --git a/src/gallium/drivers/i915/i915_debug.h b/src/gallium/drivers/i915/i915_debug.h
index 079882c811f..0f12a592ae8 100644
--- a/src/gallium/drivers/i915/i915_debug.h
+++ b/src/gallium/drivers/i915/i915_debug.h
@@ -48,13 +48,13 @@ struct i915_winsys_batchbuffer;
 extern unsigned i915_debug;
 
 #ifdef DEBUG
-static INLINE boolean
+static inline boolean
 I915_DBG_ON(unsigned flags)
 {
    return i915_debug & flags;
 }
 
-static INLINE void
+static inline void
 I915_DBG(unsigned flags, const char *fmt, ...)
 {
    if (I915_DBG_ON(flags)) {
@@ -67,7 +67,7 @@ I915_DBG(unsigned flags, const char *fmt, ...)
 }
 #else
 #define I915_DBG_ON(flags) (0)
-static INLINE void I915_DBG(unsigned flags, const char *fmt, ...) {}
+static inline void I915_DBG(unsigned flags, const char *fmt, ...) {}
 #endif
 
 void i915_debug_init(struct i915_screen *i915);
diff --git a/src/gallium/drivers/i915/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h
index a4dbcb4d271..adc42542fea 100644
--- a/src/gallium/drivers/i915/i915_fpc.h
+++ b/src/gallium/drivers/i915/i915_fpc.h
@@ -136,7 +136,7 @@ struct i915_fp_compile {
 
 /* One neat thing about the UREG representation:  
  */
-static INLINE int
+static inline int
 swizzle(int reg, uint x, uint y, uint z, uint w)
 {
    assert(x <= SRC_ONE);
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 38a33888166..456be9d92ca 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -111,7 +111,7 @@ static const float cos_constants[4] = { 1.0,
 /**
  * component-wise negation of ureg
  */
-static INLINE int
+static inline int
 negate(int reg, int x, int y, int z, int w)
 {
    /* Another neat thing about the UREG representation */
diff --git a/src/gallium/drivers/i915/i915_prim_emit.c b/src/gallium/drivers/i915/i915_prim_emit.c
index 248e21e02da..ea84efd1d17 100644
--- a/src/gallium/drivers/i915/i915_prim_emit.c
+++ b/src/gallium/drivers/i915/i915_prim_emit.c
@@ -53,7 +53,7 @@ struct setup_stage {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
+static inline struct setup_stage *setup_stage( struct draw_stage *stage )
 {
    return (struct setup_stage *)stage;
 }
@@ -65,7 +65,7 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
  * have a couple of slots at the beginning (1-dword header, 4-dword
  * clip pos) that we ignore here.
  */
-static INLINE void
+static inline void
 emit_hw_vertex( struct i915_context *i915,
                 const struct vertex_header *vertex)
 {
@@ -124,7 +124,7 @@ emit_hw_vertex( struct i915_context *i915,
 
 
 
-static INLINE void 
+static inline void 
 emit_prim( struct draw_stage *stage, 
 	   struct prim_header *prim,
 	   unsigned hwprim,
diff --git a/src/gallium/drivers/i915/i915_prim_vbuf.c b/src/gallium/drivers/i915/i915_prim_vbuf.c
index d134dbb1620..8f61f151e0c 100644
--- a/src/gallium/drivers/i915/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915/i915_prim_vbuf.c
@@ -96,7 +96,7 @@ struct i915_vbuf_render {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct i915_vbuf_render *
+static inline struct i915_vbuf_render *
 i915_vbuf_render(struct vbuf_render *render)
 {
    assert(render);
diff --git a/src/gallium/drivers/i915/i915_resource.h b/src/gallium/drivers/i915/i915_resource.h
index ef99cfb5d3c..77fe8b70f79 100644
--- a/src/gallium/drivers/i915/i915_resource.h
+++ b/src/gallium/drivers/i915/i915_resource.h
@@ -94,14 +94,14 @@ void i915_init_resource_functions(struct i915_context *i915);
 extern struct u_resource_vtbl i915_buffer_vtbl;
 extern struct u_resource_vtbl i915_texture_vtbl;
 
-static INLINE struct i915_texture *i915_texture(struct pipe_resource *resource)
+static inline struct i915_texture *i915_texture(struct pipe_resource *resource)
 {
    struct i915_texture *tex = (struct i915_texture *)resource;
    assert(tex->b.vtbl == &i915_texture_vtbl);
    return tex;
 }
 
-static INLINE struct i915_buffer *i915_buffer(struct pipe_resource *resource)
+static inline struct i915_buffer *i915_buffer(struct pipe_resource *resource)
 {
    struct i915_buffer *tex = (struct i915_buffer *)resource;
    assert(tex->b.vtbl == &i915_buffer_vtbl);
diff --git a/src/gallium/drivers/i915/i915_resource_texture.c b/src/gallium/drivers/i915/i915_resource_texture.c
index 8ef73d6f2c2..9a3279ccb75 100644
--- a/src/gallium/drivers/i915/i915_resource_texture.c
+++ b/src/gallium/drivers/i915/i915_resource_texture.c
@@ -89,25 +89,25 @@ static const int bottom_offsets[6] = {
    [PIPE_TEX_FACE_NEG_Z] = 16 + 5 * 8,
 };
 
-static INLINE unsigned
+static inline unsigned
 align_nblocksx(enum pipe_format format, unsigned width, unsigned align_to)
 {
    return align(util_format_get_nblocksx(format, width), align_to);
 }
 
-static INLINE unsigned
+static inline unsigned
 align_nblocksy(enum pipe_format format, unsigned width, unsigned align_to)
 {
    return align(util_format_get_nblocksy(format, width), align_to);
 }
 
-static INLINE unsigned
+static inline unsigned
 get_pot_stride(enum pipe_format format, unsigned width)
 {
    return util_next_power_of_two(util_format_get_stride(format, width));
 }
 
-static INLINE const char*
+static inline const char*
 get_tiling_string(enum i915_winsys_buffer_tile tile)
 {
    switch(tile) {
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 0590da07b9a..19a94a8e019 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -243,6 +243,10 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -462,15 +466,6 @@ i915_fence_reference(struct pipe_screen *screen,
    is->iws->fence_reference(is->iws, ptr, fence);
 }
 
-static boolean
-i915_fence_signalled(struct pipe_screen *screen,
-                     struct pipe_fence_handle *fence)
-{
-   struct i915_screen *is = i915_screen(screen);
-
-   return is->iws->fence_signalled(is->iws, fence) == 1;
-}
-
 static boolean
 i915_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
@@ -478,6 +473,9 @@ i915_fence_finish(struct pipe_screen *screen,
 {
    struct i915_screen *is = i915_screen(screen);
 
+   if (!timeout)
+      return is->iws->fence_signalled(is->iws, fence) == 1;
+
    return is->iws->fence_finish(is->iws, fence) == 1;
 }
 
@@ -565,7 +563,6 @@ i915_screen_create(struct i915_winsys *iws)
    is->base.context_create = i915_create_context;
 
    is->base.fence_reference = i915_fence_reference;
-   is->base.fence_signalled = i915_fence_signalled;
    is->base.fence_finish = i915_fence_finish;
 
    i915_init_screen_resource_functions(is);
diff --git a/src/gallium/drivers/i915/i915_screen.h b/src/gallium/drivers/i915/i915_screen.h
index 99d3ffd3af9..3be941a1561 100644
--- a/src/gallium/drivers/i915/i915_screen.h
+++ b/src/gallium/drivers/i915/i915_screen.h
@@ -59,7 +59,7 @@ struct i915_screen
  */
 
 
-static INLINE struct i915_screen *
+static inline struct i915_screen *
 i915_screen(struct pipe_screen *pscreen)
 {
    return (struct i915_screen *) pscreen;
diff --git a/src/gallium/drivers/i915/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c
index 4050cd4ac44..1c29e8ae671 100644
--- a/src/gallium/drivers/i915/i915_state_dynamic.c
+++ b/src/gallium/drivers/i915/i915_state_dynamic.c
@@ -46,7 +46,7 @@
  * (active) state every time a 4kb boundary is crossed.
  */
 
-static INLINE void set_dynamic(struct i915_context *i915,
+static inline void set_dynamic(struct i915_context *i915,
                                unsigned offset,
                                const unsigned state)
 {
@@ -60,7 +60,7 @@ static INLINE void set_dynamic(struct i915_context *i915,
 
 
 
-static INLINE void set_dynamic_array(struct i915_context *i915,
+static inline void set_dynamic_array(struct i915_context *i915,
                                      unsigned offset,
                                      const unsigned *src,
                                      unsigned dwords)
diff --git a/src/gallium/drivers/i915/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c
index d244a349fce..c4a6cae1beb 100644
--- a/src/gallium/drivers/i915/i915_state_immediate.c
+++ b/src/gallium/drivers/i915/i915_state_immediate.c
@@ -39,7 +39,7 @@
 /* Convinience function to check immediate state.
  */
 
-static INLINE void set_immediate(struct i915_context *i915,
+static inline void set_immediate(struct i915_context *i915,
                                  unsigned offset,
                                  const unsigned state)
 {
diff --git a/src/gallium/drivers/i915/i915_state_inlines.h b/src/gallium/drivers/i915/i915_state_inlines.h
index d4c5ab69555..015ea32933b 100644
--- a/src/gallium/drivers/i915/i915_state_inlines.h
+++ b/src/gallium/drivers/i915/i915_state_inlines.h
@@ -34,7 +34,7 @@
 #include "i915_reg.h"
 
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_compare_func(unsigned func)
 {
    switch (func) {
@@ -59,7 +59,7 @@ i915_translate_compare_func(unsigned func)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_shadow_compare_func(unsigned func)
 {
    switch (func) {
@@ -84,7 +84,7 @@ i915_translate_shadow_compare_func(unsigned func)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_stencil_op(unsigned op)
 {
    switch (op) {
@@ -109,7 +109,7 @@ i915_translate_stencil_op(unsigned op)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_blend_factor(unsigned factor)
 {
    switch (factor) {
@@ -148,7 +148,7 @@ i915_translate_blend_factor(unsigned factor)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_blend_func(unsigned mode)
 {
    switch (mode) {
@@ -168,7 +168,7 @@ i915_translate_blend_func(unsigned mode)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 i915_translate_logic_op(unsigned opcode)
 {
    switch (opcode) {
@@ -211,7 +211,7 @@ i915_translate_logic_op(unsigned opcode)
 
 
 
-static INLINE boolean i915_validate_vertices( unsigned hw_prim, unsigned nr )
+static inline boolean i915_validate_vertices( unsigned hw_prim, unsigned nr )
 {
    boolean ok;
 
diff --git a/src/gallium/drivers/ilo/Makefile.am b/src/gallium/drivers/ilo/Makefile.am
index a8785a5e8c4..1f14153748e 100644
--- a/src/gallium/drivers/ilo/Makefile.am
+++ b/src/gallium/drivers/ilo/Makefile.am
@@ -21,8 +21,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources
index e1bbb9a0781..7a7db938f92 100644
--- a/src/gallium/drivers/ilo/Makefile.sources
+++ b/src/gallium/drivers/ilo/Makefile.sources
@@ -1,5 +1,4 @@
 C_SOURCES := \
-	core/ilo_buffer.h \
 	core/ilo_builder.c \
 	core/ilo_builder.h \
 	core/ilo_builder_3d.h \
@@ -43,6 +42,7 @@ C_SOURCES := \
 	core/ilo_state_viewport.h \
 	core/ilo_state_zs.c \
 	core/ilo_state_zs.h \
+	core/ilo_vma.h \
 	core/intel_winsys.h \
 	ilo_blit.c \
 	ilo_blit.h \
@@ -65,8 +65,6 @@ C_SOURCES := \
 	ilo_public.h \
 	ilo_query.c \
 	ilo_query.h \
-	ilo_resource.c \
-	ilo_resource.h \
 	ilo_render.c \
 	ilo_render.h \
 	ilo_render_gen.h \
@@ -76,6 +74,8 @@ C_SOURCES := \
 	ilo_render_gen8.c \
 	ilo_render_media.c \
 	ilo_render_surface.c \
+	ilo_resource.c \
+	ilo_resource.h \
 	ilo_screen.c \
 	ilo_screen.h \
 	ilo_shader.c \
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
index 6d9e3699125..5efe9da2d22 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
@@ -39,6 +39,7 @@
 #include "ilo_state_shader.h"
 #include "ilo_state_viewport.h"
 #include "ilo_state_zs.h"
+#include "ilo_vma.h"
 #include "ilo_builder.h"
 #include "ilo_builder_3d_top.h"
 
@@ -674,9 +675,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
 
       dw[5] |= builder->mocs << GEN8_DEPTH_DW5_MOCS__SHIFT;
 
-      if (zs->depth_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2, zs->depth_bo,
-               zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->z_vma) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->z_vma->bo,
+               zs->z_vma->bo_offset + zs->depth[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       dw[1] = zs->depth[0];
@@ -691,9 +693,10 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
       else
          dw[6] |= builder->mocs << GEN6_DEPTH_DW6_MOCS__SHIFT;
 
-      if (zs->depth_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2, zs->depth_bo,
-               zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->z_vma) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->z_vma->bo,
+               zs->z_vma->bo_offset + zs->depth[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
@@ -724,9 +727,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT;
 
-      if (zs->stencil_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2, zs->stencil_bo,
-               zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->s_vma) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->s_vma->bo,
+               zs->s_vma->bo_offset + zs->stencil[1],
+               (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       dw[1] = zs->stencil[0];
@@ -734,9 +738,10 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN6_STENCIL_DW1_MOCS__SHIFT;
 
-      if (zs->stencil_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2, zs->stencil_bo,
-               zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->s_vma) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->s_vma->bo,
+               zs->s_vma->bo_offset + zs->stencil[1],
+               (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
@@ -767,9 +772,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT;
 
-      if (zs->hiz_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_bo,
-               zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->hiz_vma) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_vma->bo,
+               zs->hiz_vma->bo_offset + zs->hiz[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       dw[1] = zs->hiz[0];
@@ -777,9 +783,10 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
 
       dw[1] |= builder->mocs << GEN6_HIZ_DW1_MOCS__SHIFT;
 
-      if (zs->hiz_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_bo,
-               zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
+      if (zs->hiz_vma) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_vma->bo,
+               zs->hiz_vma->bo_offset + zs->hiz[1],
+               (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
index 8d30095e6f6..6e94fb25f1f 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
@@ -39,6 +39,7 @@
 #include "ilo_state_surface.h"
 #include "ilo_state_urb.h"
 #include "ilo_state_vf.h"
+#include "ilo_vma.h"
 #include "ilo_builder.h"
 
 static inline void
@@ -318,8 +319,10 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
       dw[3] = 0;
 
       if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-         if (b->need_bo)
-            ilo_builder_batch_reloc64(builder, pos + 1, b->bo, b->vb[1], 0);
+         if (b->vma) {
+            ilo_builder_batch_reloc64(builder, pos + 1, b->vma->bo,
+                  b->vma->bo_offset + b->vb[1], 0);
+         }
 
          dw[3] |= b->vb[2];
       } else {
@@ -331,9 +334,11 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
             dw[3] |= vf->user_instancing[elem][1];
          }
 
-         if (b->need_bo) {
-            ilo_builder_batch_reloc(builder, pos + 1, b->bo, b->vb[1], 0);
-            ilo_builder_batch_reloc(builder, pos + 2, b->bo, b->vb[2], 0);
+         if (b->vma) {
+            ilo_builder_batch_reloc(builder, pos + 1, b->vma->bo,
+                  b->vma->bo_offset + b->vb[1], 0);
+            ilo_builder_batch_reloc(builder, pos + 2, b->vma->bo,
+                  b->vma->bo_offset + b->vb[2], 0);
          }
       }
 
@@ -429,9 +434,11 @@ gen6_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = dw0;
-   if (ib->need_bo) {
-      ilo_builder_batch_reloc(builder, pos + 1, ib->bo, ib->ib[1], 0);
-      ilo_builder_batch_reloc(builder, pos + 2, ib->bo, ib->ib[2], 0);
+   if (ib->vma) {
+      ilo_builder_batch_reloc(builder, pos + 1, ib->vma->bo,
+            ib->vma->bo_offset + ib->ib[1], 0);
+      ilo_builder_batch_reloc(builder, pos + 2, ib->vma->bo,
+            ib->vma->bo_offset + ib->ib[2], 0);
    } else {
       dw[1] = 0;
       dw[2] = 0;
@@ -456,8 +463,9 @@ gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
    dw[1] = ib->ib[0] |
            builder->mocs << GEN8_IB_DW1_MOCS__SHIFT;
 
-   if (ib->need_bo) {
-      ilo_builder_batch_reloc64(builder, pos + 2, ib->bo, ib->ib[1], 0);
+   if (ib->vma) {
+      ilo_builder_batch_reloc64(builder, pos + 2, ib->vma->bo,
+            ib->vma->bo_offset + ib->ib[1], 0);
    } else {
       dw[2] = 0;
       dw[3] = 0;
@@ -801,11 +809,11 @@ gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
            builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT |
            sol->strides[buffer] << GEN7_SO_BUF_DW1_PITCH__SHIFT;
 
-   if (sb->need_bo) {
-      ilo_builder_batch_reloc(builder, pos + 2, sb->bo,
-            sb->so_buf[0], INTEL_RELOC_WRITE);
-      ilo_builder_batch_reloc(builder, pos + 3, sb->bo,
-            sb->so_buf[1], INTEL_RELOC_WRITE);
+   if (sb->vma) {
+      ilo_builder_batch_reloc(builder, pos + 2, sb->vma->bo,
+            sb->vma->bo_offset + sb->so_buf[0], INTEL_RELOC_WRITE);
+      ilo_builder_batch_reloc(builder, pos + 3, sb->vma->bo,
+            sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE);
    } else {
       dw[2] = 0;
       dw[3] = 0;
@@ -832,9 +840,9 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
            buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT |
            builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT;
 
-   if (sb->need_bo) {
-      ilo_builder_batch_reloc64(builder, pos + 2, sb->bo,
-            sb->so_buf[1], INTEL_RELOC_WRITE);
+   if (sb->vma) {
+      ilo_builder_batch_reloc64(builder, pos + 2, sb->vma->bo,
+            sb->vma->bo_offset + sb->so_buf[1], INTEL_RELOC_WRITE);
    } else {
       dw[2] = 0;
       dw[3] = 0;
@@ -842,9 +850,10 @@ gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
 
    dw[4] = sb->so_buf[2];
 
-   if (sb->need_write_offset_bo) {
-      ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_bo,
-            sizeof(uint32_t) * buffer, INTEL_RELOC_WRITE);
+   if (sb->write_offset_vma) {
+      ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_vma->bo,
+            sb->write_offset_vma->bo_offset + sizeof(uint32_t) * buffer,
+            INTEL_RELOC_WRITE);
    } else {
       dw[5] = 0;
       dw[6] = 0;
@@ -1254,14 +1263,15 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
             ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
       memcpy(dw, surf->surface, state_len << 2);
 
-      if (surf->bo) {
+      if (surf->vma) {
          const uint32_t mocs = (surf->scanout) ?
             (GEN8_MOCS_MT_PTE | GEN8_MOCS_CT_L3) : builder->mocs;
 
          dw[1] |= mocs << GEN8_SURFACE_DW1_MOCS__SHIFT;
 
-         ilo_builder_surface_reloc64(builder, state_offset, 8, surf->bo,
-               surf->surface[8], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
+         ilo_builder_surface_reloc64(builder, state_offset, 8, surf->vma->bo,
+               surf->vma->bo_offset + surf->surface[8],
+               (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       state_align = 32;
@@ -1271,15 +1281,16 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
             ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
       memcpy(dw, surf->surface, state_len << 2);
 
-      if (surf->bo) {
+      if (surf->vma) {
          /*
           * For scanouts, we should not enable caching in LLC.  Since we only
           * enable that on Gen8+, we are fine here.
           */
          dw[5] |= builder->mocs << GEN6_SURFACE_DW5_MOCS__SHIFT;
 
-         ilo_builder_surface_reloc(builder, state_offset, 1, surf->bo,
-               surf->surface[1], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
+         ilo_builder_surface_reloc(builder, state_offset, 1, surf->vma->bo,
+               surf->vma->bo_offset + surf->surface[1],
+               (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 
diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h
index 0a7f7d9d3fe..da7db90a54b 100644
--- a/src/gallium/drivers/ilo/core/ilo_core.h
+++ b/src/gallium/drivers/ilo/core/ilo_core.h
@@ -29,15 +29,9 @@
 #define ILO_CORE_H
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_format.h"
 
 #include "util/u_debug.h"
-#include "util/list.h"
-#include "util/u_format.h"
-#include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/u_pointer.h"
 
 #endif /* ILO_CORE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 0d837d8a9d5..fa547ac5c36 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -40,269 +40,356 @@ enum {
                         IMAGE_TILING_W)
 };
 
-struct ilo_image_params {
-   const struct ilo_dev *dev;
-   const struct pipe_resource *templ;
-   unsigned valid_tilings;
+struct ilo_image_layout {
+   enum ilo_image_walk_type walk;
+   bool interleaved_samples;
 
-   bool compressed;
+   uint8_t valid_tilings;
+   enum gen_surface_tiling tiling;
 
-   unsigned h0, h1;
-   unsigned max_x, max_y;
+   enum ilo_image_aux_type aux;
+
+   int align_i;
+   int align_j;
+
+   struct ilo_image_lod *lods;
+   int walk_layer_h0;
+   int walk_layer_h1;
+   int walk_layer_height;
+   int monolithic_width;
+   int monolithic_height;
 };
 
-static void
-img_get_slice_size(const struct ilo_image *img,
-                   const struct ilo_image_params *params,
-                   unsigned level, unsigned *width, unsigned *height)
+static enum ilo_image_walk_type
+image_get_gen6_walk(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info)
 {
-   const struct pipe_resource *templ = params->templ;
-   unsigned w, h;
+   ILO_DEV_ASSERT(dev, 6, 6);
 
-   w = u_minify(img->width0, level);
-   h = u_minify(img->height0, level);
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 114:
-    *
-    *     "The dimensions of the mip maps are first determined by applying the
-    *      sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then,
-    *      if necessary, they are padded out to compression block boundaries."
-    */
-   w = align(w, img->block_width);
-   h = align(h, img->block_height);
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 111:
-    *
-    *     "If the surface is multisampled (4x), these values must be adjusted
-    *      as follows before proceeding:
-    *
-    *        W_L = ceiling(W_L / 2) * 4
-    *        H_L = ceiling(H_L / 2) * 4"
-    *
-    * From the Ivy Bridge PRM, volume 1 part 1, page 108:
-    *
-    *     "If the surface is multisampled and it is a depth or stencil surface
-    *      or Multisampled Surface StorageFormat in SURFACE_STATE is
-    *      MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before
-    *      proceeding:
-    *
-    *        #samples  W_L =                    H_L =
-    *        2         ceiling(W_L / 2) * 4     HL [no adjustment]
-    *        4         ceiling(W_L / 2) * 4     ceiling(H_L / 2) * 4
-    *        8         ceiling(W_L / 2) * 8     ceiling(H_L / 2) * 4
-    *        16        ceiling(W_L / 2) * 8     ceiling(H_L / 2) * 8"
-    *
-    * For interleaved samples (4x), where pixels
-    *
-    *   (x, y  ) (x+1, y  )
-    *   (x, y+1) (x+1, y+1)
-    *
-    * would be is occupied by
-    *
-    *   (x, y  , si0) (x+1, y  , si0) (x, y  , si1) (x+1, y  , si1)
-    *   (x, y+1, si0) (x+1, y+1, si0) (x, y+1, si1) (x+1, y+1, si1)
-    *   (x, y  , si2) (x+1, y  , si2) (x, y  , si3) (x+1, y  , si3)
-    *   (x, y+1, si2) (x+1, y+1, si2) (x, y+1, si3) (x+1, y+1, si3)
-    *
-    * Thus the need to
-    *
-    *   w = align(w, 2) * 2;
-    *   y = align(y, 2) * 2;
-    */
-   if (img->interleaved_samples) {
-      switch (templ->nr_samples) {
-      case 0:
-      case 1:
-         break;
-      case 2:
-         w = align(w, 2) * 2;
-         break;
-      case 4:
-         w = align(w, 2) * 2;
-         h = align(h, 2) * 2;
-         break;
-      case 8:
-         w = align(w, 2) * 4;
-         h = align(h, 2) * 2;
-         break;
-      case 16:
-         w = align(w, 2) * 4;
-         h = align(h, 2) * 4;
-         break;
-      default:
-         assert(!"unsupported sample count");
-         break;
-      }
-   }
-
-   /*
-    * From the Ivy Bridge PRM, volume 1 part 1, page 108:
-    *
-    *     "For separate stencil buffer, the width must be mutiplied by 2 and
-    *      height divided by 2..."
-    *
-    * To make things easier (for transfer), we will just double the stencil
-    * stride in 3DSTATE_STENCIL_BUFFER.
-    */
-   w = align(w, img->align_i);
-   h = align(h, img->align_j);
-
-   *width = w;
-   *height = h;
-}
-
-static unsigned
-img_get_num_layers(const struct ilo_image *img,
-                   const struct ilo_image_params *params)
-{
-   const struct pipe_resource *templ = params->templ;
-   unsigned num_layers = templ->array_size;
-
-   /* samples of the same index are stored in a layer */
-   if (templ->nr_samples > 1 && !img->interleaved_samples)
-      num_layers *= templ->nr_samples;
-
-   return num_layers;
-}
-
-static void
-img_init_layer_height(struct ilo_image *img,
-                      struct ilo_image_params *params)
-{
-   const struct pipe_resource *templ = params->templ;
-   unsigned num_layers;
-
-   if (img->walk != ILO_IMAGE_WALK_LAYER)
-      return;
-
-   num_layers = img_get_num_layers(img, params);
-   if (num_layers <= 1)
-      return;
+   /* TODO we want LODs to be page-aligned */
+   if (info->type == GEN6_SURFTYPE_3D)
+      return ILO_IMAGE_WALK_3D;
 
    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 115:
     *
-    *     "The following equation is used for surface formats other than
-    *      compressed textures:
+    *     "The separate stencil buffer does not support mip mapping, thus the
+    *      storage for LODs other than LOD 0 is not needed. The following
+    *      QPitch equation applies only to the separate stencil buffer:
     *
-    *        QPitch = (h0 + h1 + 11j)"
+    *        QPitch = h_0"
     *
-    *     "The equation for compressed textures (BC* and FXT1 surface formats)
-    *      follows:
-    *
-    *        QPitch = (h0 + h1 + 11j) / 4"
-    *
-    *     "[DevSNB] Errata: Sampler MSAA Qpitch will be 4 greater than the
-    *      value calculated in the equation above, for every other odd Surface
-    *      Height starting from 1 i.e. 1,5,9,13"
-    *
-    * From the Ivy Bridge PRM, volume 1 part 1, page 111-112:
-    *
-    *     "If Surface Array Spacing is set to ARYSPC_FULL (note that the depth
-    *      buffer and stencil buffer have an implied value of ARYSPC_FULL):
-    *
-    *        QPitch = (h0 + h1 + 12j)
-    *        QPitch = (h0 + h1 + 12j) / 4 (compressed)
-    *
-    *      (There are many typos or missing words here...)"
-    *
-    * To access the N-th slice, an offset of (Stride * QPitch * N) is added to
-    * the base address.  The PRM divides QPitch by 4 for compressed formats
-    * because the block height for those formats are 4, and it wants QPitch to
-    * mean the number of memory rows, as opposed to texel rows, between
-    * slices.  Since we use texel rows everywhere, we do not need to divide
-    * QPitch by 4.
+    * Use ILO_IMAGE_WALK_LOD and manually offset to the (page-aligned) levels
+    * when bound.
     */
-   img->walk_layer_height = params->h0 + params->h1 +
-      ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 12 : 11) * img->align_j;
+   if (info->bind_zs && info->format == GEN6_FORMAT_R8_UINT)
+      return ILO_IMAGE_WALK_LOD;
 
-   if (ilo_dev_gen(params->dev) == ILO_GEN(6) && templ->nr_samples > 1 &&
-       img->height0 % 4 == 1)
-      img->walk_layer_height += 4;
-
-   params->max_y += img->walk_layer_height * (num_layers - 1);
+   /* compact spacing is not supported otherwise */
+   return ILO_IMAGE_WALK_LAYER;
 }
 
-static void
-img_init_lods(struct ilo_image *img,
-              struct ilo_image_params *params)
+static enum ilo_image_walk_type
+image_get_gen7_walk(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info)
 {
-   const struct pipe_resource *templ = params->templ;
-   unsigned cur_x, cur_y;
-   unsigned lv;
+   ILO_DEV_ASSERT(dev, 7, 8);
 
-   cur_x = 0;
-   cur_y = 0;
-   for (lv = 0; lv <= templ->last_level; lv++) {
-      unsigned lod_w, lod_h;
+   if (info->type == GEN6_SURFTYPE_3D)
+      return ILO_IMAGE_WALK_3D;
 
-      img_get_slice_size(img, params, lv, &lod_w, &lod_h);
+   /*
+    * From the Ivy Bridge PRM, volume 1 part 1, page 111:
+    *
+    *     "note that the depth buffer and stencil buffer have an implied value
+    *      of ARYSPC_FULL"
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 66:
+    *
+    *     "If Multisampled Surface Storage Format is MSFMT_MSS and Number of
+    *      Multisamples is not MULTISAMPLECOUNT_1, this field (Surface Array
+    *      Spacing) must be set to ARYSPC_LOD0."
+    */
+   if (info->sample_count > 1)
+      assert(info->level_count == 1);
+   return (info->bind_zs || info->level_count > 1) ?
+      ILO_IMAGE_WALK_LAYER : ILO_IMAGE_WALK_LOD;
+}
 
-      img->lods[lv].x = cur_x;
-      img->lods[lv].y = cur_y;
-      img->lods[lv].slice_width = lod_w;
-      img->lods[lv].slice_height = lod_h;
+static bool
+image_get_gen6_interleaved_samples(const struct ilo_dev *dev,
+                                   const struct ilo_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-      switch (img->walk) {
-      case ILO_IMAGE_WALK_LAYER:
-         /* MIPLAYOUT_BELOW */
-         if (lv == 1)
-            cur_x += lod_w;
-         else
-            cur_y += lod_h;
-         break;
-      case ILO_IMAGE_WALK_LOD:
-         lod_h *= img_get_num_layers(img, params);
-         if (lv == 1)
-            cur_x += lod_w;
-         else
-            cur_y += lod_h;
+   /*
+    * Gen6 supports only interleaved samples.  It is not explicitly stated,
+    * but on Gen7+, render targets are expected to be UMS/CMS (samples
+    * non-interleaved) and depth/stencil buffers are expected to be IMS
+    * (samples interleaved).
+    *
+    * See "Multisampled Surface Storage Format" field of SURFACE_STATE.
+    */
+   return (ilo_dev_gen(dev) == ILO_GEN(6) || info->bind_zs);
+}
 
-         /* every LOD begins at tile boundaries */
-         if (templ->last_level > 0) {
-            assert(img->format == PIPE_FORMAT_S8_UINT);
-            cur_x = align(cur_x, 64);
-            cur_y = align(cur_y, 64);
-         }
-         break;
-      case ILO_IMAGE_WALK_3D:
-         {
-            const unsigned num_slices = u_minify(templ->depth0, lv);
-            const unsigned num_slices_per_row = 1 << lv;
-            const unsigned num_rows =
-               (num_slices + num_slices_per_row - 1) / num_slices_per_row;
+static uint8_t
+image_get_gen6_valid_tilings(const struct ilo_dev *dev,
+                             const struct ilo_image_info *info)
+{
+   uint8_t valid_tilings = IMAGE_TILING_ALL;
 
-            lod_w *= num_slices_per_row;
-            lod_h *= num_rows;
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-            cur_y += lod_h;
-         }
-         break;
-      }
+   if (info->valid_tilings)
+      valid_tilings &= info->valid_tilings;
 
-      if (params->max_x < img->lods[lv].x + lod_w)
-         params->max_x = img->lods[lv].x + lod_w;
-      if (params->max_y < img->lods[lv].y + lod_h)
-         params->max_y = img->lods[lv].y + lod_h;
-   }
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 2, page 32:
+    *
+    *     "Display/Overlay   Y-Major not supported.
+    *                        X-Major required for Async Flips"
+    */
+   if (unlikely(info->bind_scanout))
+      valid_tilings &= IMAGE_TILING_X;
 
-   if (img->walk == ILO_IMAGE_WALK_LAYER) {
-      params->h0 = img->lods[0].slice_height;
+   /*
+    * From the Sandy Bridge PRM, volume 3 part 2, page 158:
+    *
+    *     "The cursor surface address must be 4K byte aligned. The cursor must
+    *      be in linear memory, it cannot be tiled."
+    */
+   if (unlikely(info->bind_cursor))
+      valid_tilings &= IMAGE_TILING_NONE;
 
-      if (templ->last_level > 0)
-         params->h1 = img->lods[1].slice_height;
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 318:
+    *
+    *     "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear
+    *      Depth Buffer is not supported."
+    *
+    *     "The Depth Buffer, if tiled, must use Y-Major tiling."
+    *
+    * From the Sandy Bridge PRM, volume 1 part 2, page 22:
+    *
+    *     "W-Major Tile Format is used for separate stencil."
+    */
+   if (info->bind_zs) {
+      if (info->format == GEN6_FORMAT_R8_UINT)
+         valid_tilings &= IMAGE_TILING_W;
       else
-         img_get_slice_size(img, params, 1, &cur_x, &params->h1);
+         valid_tilings &= IMAGE_TILING_Y;
+   }
+
+   if (info->bind_surface_sampler ||
+       info->bind_surface_dp_render ||
+       info->bind_surface_dp_typed) {
+      /*
+       * From the Haswell PRM, volume 2d, page 233:
+       *
+       *     "If Number of Multisamples is not MULTISAMPLECOUNT_1, this field
+       *      (Tiled Surface) must be TRUE."
+       */
+      if (info->sample_count > 1)
+         valid_tilings &= ~IMAGE_TILING_NONE;
+
+      if (ilo_dev_gen(dev) < ILO_GEN(8))
+         valid_tilings &= ~IMAGE_TILING_W;
+   }
+
+   if (info->bind_surface_dp_render) {
+      /*
+       * From the Sandy Bridge PRM, volume 1 part 2, page 32:
+       *
+       *     "NOTE: 128BPE Format Color buffer ( render target ) MUST be
+       *      either TileX or Linear."
+       *
+       * From the Haswell PRM, volume 5, page 32:
+       *
+       *     "NOTE: 128 BPP format color buffer (render target) supports
+       *      Linear, TiledX and TiledY."
+       */
+      if (ilo_dev_gen(dev) < ILO_GEN(7.5) && info->block_size == 16)
+         valid_tilings &= ~IMAGE_TILING_Y;
+
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+       *
+       *     "This field (Surface Vertical Aligment) must be set to VALIGN_4
+       *      for all tiled Y Render Target surfaces."
+       *
+       *     "VALIGN_4 is not supported for surface format R32G32B32_FLOAT."
+       *
+       * R32G32B32_FLOAT is not renderable and we only need an assert() here.
+       */
+      if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
+         assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT);
+   }
+
+   return valid_tilings;
+}
+
+static uint64_t
+image_get_gen6_estimated_size(const struct ilo_dev *dev,
+                              const struct ilo_image_info *info)
+{
+   /* rough byte size, no padding; widen first so products stay in 64 bits */
+   const uint64_t slice_size = (uint64_t) info->width * info->height *
+      info->block_size / (info->block_width * info->block_height);
+   const uint64_t slice_count =
+      (uint64_t) info->depth * info->array_size * info->sample_count;
+   const uint64_t estimated_size = slice_size * slice_count;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->level_count == 1)
+      return estimated_size;
+   /* more than one level: approximate the extra levels as one third more */
+   return estimated_size * 4 / 3;
+}
+
+static enum gen_surface_tiling
+image_get_gen6_tiling(const struct ilo_dev *dev,
+                      const struct ilo_image_info *info,
+                      uint8_t valid_tilings)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* nothing to choose when exactly one tiling is allowed */
+   if (valid_tilings == IMAGE_TILING_NONE)
+      return GEN6_TILING_NONE;
+   else if (valid_tilings == IMAGE_TILING_X)
+      return GEN6_TILING_X;
+   else if (valid_tilings == IMAGE_TILING_Y)
+      return GEN6_TILING_Y;
+   else if (valid_tilings == IMAGE_TILING_W)
+      return GEN8_TILING_W;
+
+   /*
+    * X-tiling keeps vertically adjacent pixels in the same page most of the
+    * time, and Y-tiling keeps them in the same cacheline.  That buys nothing
+    * for an image that is one row high, is not bound as a sampler, render or
+    * typed surface, or is tiny, so stay linear in those cases when allowed.
+    */
+   if (valid_tilings & IMAGE_TILING_NONE) {
+      const uint64_t estimated_size =
+         image_get_gen6_estimated_size(dev, info);
+      const bool is_surface = info->bind_surface_sampler ||
+                              info->bind_surface_dp_render ||
+                              info->bind_surface_dp_typed;
+
+      if (info->height == 1 || !is_surface)
+         return GEN6_TILING_NONE;
+
+      if (estimated_size <= 64)
+         return GEN6_TILING_NONE;
+      if (estimated_size > info->prefer_linear_threshold)
+         return GEN6_TILING_NONE;
+
+      /* still small: take X-tiling out of the running */
+      if (estimated_size <= 2048)
+         valid_tilings &= ~IMAGE_TILING_X;
+   }
+
+   /* prefer Y over X over linear among what is left */
+   if (valid_tilings & IMAGE_TILING_Y)
+      return GEN6_TILING_Y;
+   if (valid_tilings & IMAGE_TILING_X)
+      return GEN6_TILING_X;
+   return GEN6_TILING_NONE;
+}
+
+static bool
+image_get_gen6_hiz_enable(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info)
+{
+   /* depth buffer?  R8_UINT and interleaved stencil do not qualify */
+   const bool is_depth = info->bind_zs &&
+                         info->format != GEN6_FORMAT_R8_UINT &&
+                         !info->interleaved_stencil;
+
+   /* 1D is out: we want to be able to force 8x4 alignments */
+   const bool can_align = (info->type != GEN6_SURFTYPE_1D);
+
+   /* off per image (aux_disable) or by the ILO_DEBUG_NOHIZ debug bit */
+   const bool aux_off = info->aux_disable;
+   const bool debug_off = (ilo_debug & ILO_DEBUG_NOHIZ) != 0;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!is_depth || !can_align)
+      return false;
+   if (aux_off || debug_off)
+      return false;
+   return true;
+}
+
+static bool
+image_get_gen7_mcs_enable(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          enum gen_surface_tiling tiling)
+{
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!info->bind_surface_sampler && !info->bind_surface_dp_render)
+      return false;
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 77:
+    *
+    *     "For Render Target and Sampling Engine Surfaces:If the surface is
+    *      multisampled (Number of Multisamples any value other than
+    *      MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled."
+    *
+    *     "This field must be set to 0 for all SINT MSRTs when all RT channels
+    *      are not written"
+    */
+   if (info->sample_count > 1) {
+      if (ilo_dev_gen(dev) < ILO_GEN(8))
+         assert(!info->is_integer);
+      return true;
+   }
+   /* single-sampled from here on, where aux_disable is honored */
+   if (info->aux_disable)
+      return false;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 326:
+    *
+    *     "When MCS is buffer is used for color clear of non-multisampler
+    *      render target, the following restrictions apply.
+    *      - Support is limited to tiled render targets.
+    *      - Support is for non-mip-mapped and non-array surface types only.
+    *      - Clear is supported only on the full RT; i.e., no partial clear or
+    *        overlapping clears.
+    *      - MCS buffer for non-MSRT is supported only for RT formats 32bpp,
+    *        64bpp and 128bpp.
+    *      ..."
+    *
+    * NOTE(review): SURFTYPE_3D is not rejected below; confirm it is supported.
+    */
+   if (!info->bind_surface_dp_render ||
+       tiling == GEN6_TILING_NONE ||
+       info->level_count > 1 ||
+       info->array_size > 1)
+      return false;
+
+   switch (info->block_size) {
+   case 4:
+   case 8:
+   case 16:
+      return true;
+   default:
+      return false;
    }
 }
 
 static void
-img_init_alignments(struct ilo_image *img,
-                    const struct ilo_image_params *params)
+image_get_gen6_alignments(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          int *align_i, int *align_j)
 {
-   const struct pipe_resource *templ = params->templ;
+   ILO_DEV_ASSERT(dev, 6, 6);
 
    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 113:
@@ -335,13 +422,33 @@ img_init_alignments(struct ilo_image *img,
     *
     *                                  align_i        align_j
     *   compressed formats             block width    block height
-    *   PIPE_FORMAT_S8_UINT            4              2
+    *   GEN6_FORMAT_R8_UINT            4              2
     *   other depth/stencil formats    4              4
     *   4x multisampled                4              4
     *   bpp 96                         4              2
     *   others                         4              2 or 4
     */
 
+   *align_i = (info->compressed) ? info->block_width : 4;
+   if (info->compressed) {
+      *align_j = info->block_height;
+   } else if (info->bind_zs) {
+      *align_j = (info->format == GEN6_FORMAT_R8_UINT) ? 2 : 4;
+   } else {
+      *align_j = (info->sample_count > 1 || info->block_size != 12) ? 4 : 2;
+   }
+}
+
+static void
+image_get_gen7_alignments(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          enum gen_surface_tiling tiling,
+                          int *align_i, int *align_j)
+{
+   int i, j;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
    /*
     * From the Ivy Bridge PRM, volume 1 part 1, page 110:
     *
@@ -383,465 +490,301 @@ img_init_alignments(struct ilo_image *img,
     *
     *                                  align_i        align_j
     *  compressed formats              block width    block height
-    *  PIPE_FORMAT_Z16_UNORM           8              4
-    *  PIPE_FORMAT_S8_UINT             8              8
+    *  GEN6_FORMAT_R16_UNORM           8              4
+    *  GEN6_FORMAT_R8_UINT             8              8
     *  other depth/stencil formats     4              4
     *  2x or 4x multisampled           4 or 8         4
     *  tiled Y                         4 or 8         4 (if rt)
-    *  PIPE_FORMAT_R32G32B32_FLOAT     4 or 8         2
+    *  GEN6_FORMAT_R32G32B32_FLOAT     4 or 8         2
     *  others                          4 or 8         2 or 4
     */
-
-   if (params->compressed) {
-      /* this happens to be the case */
-      img->align_i = img->block_width;
-      img->align_j = img->block_height;
-   } else if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7)) {
-         switch (img->format) {
-         case PIPE_FORMAT_Z16_UNORM:
-            img->align_i = 8;
-            img->align_j = 4;
-            break;
-         case PIPE_FORMAT_S8_UINT:
-            img->align_i = 8;
-            img->align_j = 8;
-            break;
-         default:
-            img->align_i = 4;
-            img->align_j = 4;
-            break;
-         }
-      } else {
-         switch (img->format) {
-         case PIPE_FORMAT_S8_UINT:
-            img->align_i = 4;
-            img->align_j = 2;
-            break;
-         default:
-            img->align_i = 4;
-            img->align_j = 4;
-            break;
-         }
+   if (info->compressed) {
+      i = info->block_width;
+      j = info->block_height;
+   } else if (info->bind_zs) {
+      switch (info->format) {
+      case GEN6_FORMAT_R16_UNORM:
+         i = 8;
+         j = 4;
+         break;
+      case GEN6_FORMAT_R8_UINT:
+         i = 8;
+         j = 8;
+         break;
+      default:
+         i = 4;
+         j = 4;
+         break;
       }
    } else {
       const bool valign_4 =
-         (templ->nr_samples > 1) ||
-         (ilo_dev_gen(params->dev) >= ILO_GEN(8)) ||
-         (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
-          img->tiling == GEN6_TILING_Y &&
-          (templ->bind & PIPE_BIND_RENDER_TARGET));
+         (info->sample_count > 1 || ilo_dev_gen(dev) >= ILO_GEN(8) ||
+          (tiling == GEN6_TILING_Y && info->bind_surface_dp_render));
 
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
-          ilo_dev_gen(params->dev) <= ILO_GEN(7.5) && valign_4)
-         assert(img->format != PIPE_FORMAT_R32G32B32_FLOAT);
+      if (ilo_dev_gen(dev) < ILO_GEN(8) && valign_4)
+         assert(info->format != GEN6_FORMAT_R32G32B32_FLOAT);
 
-      img->align_i = 4;
-      img->align_j = (valign_4) ? 4 : 2;
+      i = 4;
+      j = (valign_4) ? 4 : 2;
    }
 
-   /*
-    * the fact that align i and j are multiples of block width and height
-    * respectively is what makes the size of the bo a multiple of the block
-    * size, slices start at block boundaries, and many of the computations
-    * work.
-    */
-   assert(img->align_i % img->block_width == 0);
-   assert(img->align_j % img->block_height == 0);
-
-   /* make sure align() works */
-   assert(util_is_power_of_two(img->align_i) &&
-          util_is_power_of_two(img->align_j));
-   assert(util_is_power_of_two(img->block_width) &&
-          util_is_power_of_two(img->block_height));
-}
-
-static void
-img_init_tiling(struct ilo_image *img,
-                const struct ilo_image_params *params)
-{
-   const struct pipe_resource *templ = params->templ;
-   unsigned preferred_tilings = params->valid_tilings;
-
-   /* no fencing nor BLT support */
-   if (preferred_tilings & ~IMAGE_TILING_W)
-      preferred_tilings &= ~IMAGE_TILING_W;
-
-   if (templ->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW)) {
-      /*
-       * heuristically set a minimum width/height for enabling tiling
-       */
-      if (img->width0 < 64 && (preferred_tilings & ~IMAGE_TILING_X))
-         preferred_tilings &= ~IMAGE_TILING_X;
-
-      if ((img->width0 < 32 || img->height0 < 16) &&
-          (img->width0 < 16 || img->height0 < 32) &&
-          (preferred_tilings & ~IMAGE_TILING_Y))
-         preferred_tilings &= ~IMAGE_TILING_Y;
-   } else {
-      /* force linear if we are not sure where the texture is bound to */
-      if (preferred_tilings & IMAGE_TILING_NONE)
-         preferred_tilings &= IMAGE_TILING_NONE;
-   }
-
-   /* prefer tiled over linear */
-   if (preferred_tilings & IMAGE_TILING_Y)
-      img->tiling = GEN6_TILING_Y;
-   else if (preferred_tilings & IMAGE_TILING_X)
-      img->tiling = GEN6_TILING_X;
-   else if (preferred_tilings & IMAGE_TILING_W)
-      img->tiling = GEN8_TILING_W;
-   else
-      img->tiling = GEN6_TILING_NONE;
-}
-
-static void
-img_init_walk_gen7(struct ilo_image *img,
-                   const struct ilo_image_params *params)
-{
-   const struct pipe_resource *templ = params->templ;
-
-   /*
-    * It is not explicitly states, but render targets are expected to be
-    * UMS/CMS (samples non-interleaved) and depth/stencil buffers are expected
-    * to be IMS (samples interleaved).
-    *
-    * See "Multisampled Surface Storage Format" field of SURFACE_STATE.
-    */
-   if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
-      /*
-       * From the Ivy Bridge PRM, volume 1 part 1, page 111:
-       *
-       *     "note that the depth buffer and stencil buffer have an implied
-       *      value of ARYSPC_FULL"
-       */
-      img->walk = (templ->target == PIPE_TEXTURE_3D) ?
-         ILO_IMAGE_WALK_3D : ILO_IMAGE_WALK_LAYER;
-
-      img->interleaved_samples = true;
-   } else {
-      /*
-       * From the Ivy Bridge PRM, volume 4 part 1, page 66:
-       *
-       *     "If Multisampled Surface Storage Format is MSFMT_MSS and Number
-       *      of Multisamples is not MULTISAMPLECOUNT_1, this field (Surface
-       *      Array Spacing) must be set to ARYSPC_LOD0."
-       *
-       * As multisampled resources are not mipmapped, we never use
-       * ARYSPC_FULL for them.
-       */
-      if (templ->nr_samples > 1)
-         assert(templ->last_level == 0);
-
-      img->walk =
-         (templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
-         (templ->last_level > 0) ? ILO_IMAGE_WALK_LAYER :
-         ILO_IMAGE_WALK_LOD;
-
-      img->interleaved_samples = false;
-   }
-}
-
-static void
-img_init_walk_gen6(struct ilo_image *img,
-                   const struct ilo_image_params *params)
-{
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 115:
-    *
-    *     "The separate stencil buffer does not support mip mapping, thus the
-    *      storage for LODs other than LOD 0 is not needed. The following
-    *      QPitch equation applies only to the separate stencil buffer:
-    *
-    *        QPitch = h_0"
-    *
-    * GEN6 does not support compact spacing otherwise.
-    */
-   img->walk =
-      (params->templ->target == PIPE_TEXTURE_3D) ? ILO_IMAGE_WALK_3D :
-      (img->format == PIPE_FORMAT_S8_UINT) ? ILO_IMAGE_WALK_LOD :
-      ILO_IMAGE_WALK_LAYER;
-
-   /* GEN6 supports only interleaved samples */
-   img->interleaved_samples = true;
-}
-
-static void
-img_init_walk(struct ilo_image *img,
-              const struct ilo_image_params *params)
-{
-   if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
-      img_init_walk_gen7(img, params);
-   else
-      img_init_walk_gen6(img, params);
-}
-
-static unsigned
-img_get_valid_tilings(const struct ilo_image *img,
-                      const struct ilo_image_params *params)
-{
-   const struct pipe_resource *templ = params->templ;
-   const enum pipe_format format = img->format;
-   unsigned valid_tilings = params->valid_tilings;
-
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 2, page 32:
-    *
-    *     "Display/Overlay   Y-Major not supported.
-    *                        X-Major required for Async Flips"
-    */
-   if (unlikely(templ->bind & PIPE_BIND_SCANOUT))
-      valid_tilings &= IMAGE_TILING_X;
-
-   /*
-    * From the Sandy Bridge PRM, volume 3 part 2, page 158:
-    *
-    *     "The cursor surface address must be 4K byte aligned. The cursor must
-    *      be in linear memory, it cannot be tiled."
-    */
-   if (unlikely(templ->bind & (PIPE_BIND_CURSOR | PIPE_BIND_LINEAR)))
-      valid_tilings &= IMAGE_TILING_NONE;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 318:
-    *
-    *     "[DevSNB+]: This field (Tiled Surface) must be set to TRUE. Linear
-    *      Depth Buffer is not supported."
-    *
-    *     "The Depth Buffer, if tiled, must use Y-Major tiling."
-    *
-    * From the Sandy Bridge PRM, volume 1 part 2, page 22:
-    *
-    *     "W-Major Tile Format is used for separate stencil."
-    */
-   if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
-      switch (format) {
-      case PIPE_FORMAT_S8_UINT:
-         valid_tilings &= IMAGE_TILING_W;
-         break;
-      default:
-         valid_tilings &= IMAGE_TILING_Y;
-         break;
-      }
-   }
-
-   if (templ->bind & PIPE_BIND_RENDER_TARGET) {
-      /*
-       * From the Sandy Bridge PRM, volume 1 part 2, page 32:
-       *
-       *     "NOTE: 128BPE Format Color buffer ( render target ) MUST be
-       *      either TileX or Linear."
-       *
-       * From the Haswell PRM, volume 5, page 32:
-       *
-       *     "NOTE: 128 BPP format color buffer (render target) supports
-       *      Linear, TiledX and TiledY."
-       */
-      if (ilo_dev_gen(params->dev) < ILO_GEN(7.5) && img->block_size == 16)
-         valid_tilings &= ~IMAGE_TILING_Y;
-
-      /*
-       * From the Ivy Bridge PRM, volume 4 part 1, page 63:
-       *
-       *     "This field (Surface Vertical Aligment) must be set to VALIGN_4
-       *      for all tiled Y Render Target surfaces."
-       *
-       *     "VALIGN_4 is not supported for surface format R32G32B32_FLOAT."
-       */
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7) &&
-          ilo_dev_gen(params->dev) <= ILO_GEN(7.5) &&
-          img->format == PIPE_FORMAT_R32G32B32_FLOAT)
-         valid_tilings &= ~IMAGE_TILING_Y;
-
-      valid_tilings &= ~IMAGE_TILING_W;
-   }
-
-   if (templ->bind & PIPE_BIND_SAMPLER_VIEW) {
-      if (ilo_dev_gen(params->dev) < ILO_GEN(8))
-         valid_tilings &= ~IMAGE_TILING_W;
-   }
-
-   /* no conflicting binding flags */
-   assert(valid_tilings);
-
-   return valid_tilings;
-}
-
-static void
-img_init_size_and_format(struct ilo_image *img,
-                         struct ilo_image_params *params)
-{
-   const struct pipe_resource *templ = params->templ;
-   enum pipe_format format = templ->format;
-   bool require_separate_stencil = false;
-
-   img->target = templ->target;
-   img->width0 = templ->width0;
-   img->height0 = templ->height0;
-   img->depth0 = templ->depth0;
-   img->array_size = templ->array_size;
-   img->level_count = templ->last_level + 1;
-   img->sample_count = (templ->nr_samples) ? templ->nr_samples : 1;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 317:
-    *
-    *     "This field (Separate Stencil Buffer Enable) must be set to the same
-    *      value (enabled or disabled) as Hierarchical Depth Buffer Enable."
-    *
-    * GEN7+ requires separate stencil buffers.
-    */
-   if (templ->bind & PIPE_BIND_DEPTH_STENCIL) {
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
-         require_separate_stencil = true;
-      else
-         require_separate_stencil = (img->aux.type == ILO_IMAGE_AUX_HIZ);
-   }
-
-   switch (format) {
-   case PIPE_FORMAT_ETC1_RGB8:
-      format = PIPE_FORMAT_R8G8B8X8_UNORM;
-      break;
-   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-      if (require_separate_stencil) {
-         format = PIPE_FORMAT_Z24X8_UNORM;
-         img->separate_stencil = true;
-      }
-      break;
-   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-      if (require_separate_stencil) {
-         format = PIPE_FORMAT_Z32_FLOAT;
-         img->separate_stencil = true;
-      }
-      break;
-   default:
-      break;
-   }
-
-   img->format = format;
-   img->block_width = util_format_get_blockwidth(format);
-   img->block_height = util_format_get_blockheight(format);
-   img->block_size = util_format_get_blocksize(format);
-
-   params->valid_tilings = img_get_valid_tilings(img, params);
-   params->compressed = util_format_is_compressed(img->format);
+   *align_i = i;
+   *align_j = j;
 }
 
 static bool
-img_want_mcs(const struct ilo_image *img,
-             const struct ilo_image_params *params)
+image_init_gen6_hardware_layout(const struct ilo_dev *dev,
+                                const struct ilo_image_info *info,
+                                struct ilo_image_layout *layout)
 {
-   const struct pipe_resource *templ = params->templ;
-   bool want_mcs = false;
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-   /* MCS is for RT on GEN7+ */
-   if (ilo_dev_gen(params->dev) < ILO_GEN(7))
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      layout->walk = image_get_gen7_walk(dev, info);
+   else
+      layout->walk = image_get_gen6_walk(dev, info);
+
+   layout->interleaved_samples =
+      image_get_gen6_interleaved_samples(dev, info);
+
+   layout->valid_tilings = image_get_gen6_valid_tilings(dev, info);
+   if (!layout->valid_tilings)
       return false;
 
-   if (templ->target != PIPE_TEXTURE_2D ||
-       !(templ->bind & PIPE_BIND_RENDER_TARGET))
-      return false;
+   layout->tiling = image_get_gen6_tiling(dev, info, layout->valid_tilings);
 
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 77:
-    *
-    *     "For Render Target and Sampling Engine Surfaces:If the surface is
-    *      multisampled (Number of Multisamples any value other than
-    *      MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled."
-    *
-    *     "This field must be set to 0 for all SINT MSRTs when all RT channels
-    *      are not written"
-    */
-   if (templ->nr_samples > 1 && !util_format_is_pure_sint(templ->format)) {
-      want_mcs = true;
-   } else if (templ->nr_samples <= 1) {
-      /*
-       * From the Ivy Bridge PRM, volume 2 part 1, page 326:
-       *
-       *     "When MCS is buffer is used for color clear of non-multisampler
-       *      render target, the following restrictions apply.
-       *      - Support is limited to tiled render targets.
-       *      - Support is for non-mip-mapped and non-array surface types
-       *        only.
-       *      - Clear is supported only on the full RT; i.e., no partial clear
-       *        or overlapping clears.
-       *      - MCS buffer for non-MSRT is supported only for RT formats
-       *        32bpp, 64bpp and 128bpp.
-       *      ..."
-       */
-      if (img->tiling != GEN6_TILING_NONE &&
-          templ->last_level == 0 && templ->array_size == 1) {
-         switch (img->block_size) {
-         case 4:
-         case 8:
-         case 16:
-            want_mcs = true;
-            break;
-         default:
-            break;
-         }
-      }
+   if (image_get_gen6_hiz_enable(dev, info))
+      layout->aux = ILO_IMAGE_AUX_HIZ;
+   else if (ilo_dev_gen(dev) >= ILO_GEN(7) &&
+            image_get_gen7_mcs_enable(dev, info, layout->tiling))
+      layout->aux = ILO_IMAGE_AUX_MCS;
+   else
+      layout->aux = ILO_IMAGE_AUX_NONE;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      image_get_gen7_alignments(dev, info, layout->tiling,
+            &layout->align_i, &layout->align_j);
+   } else {
+      image_get_gen6_alignments(dev, info,
+            &layout->align_i, &layout->align_j);
    }
 
-   return want_mcs;
+   return true;
 }
 
 static bool
-img_want_hiz(const struct ilo_image *img,
-             const struct ilo_image_params *params)
+image_init_gen6_transfer_layout(const struct ilo_dev *dev,
+                                const struct ilo_image_info *info,
+                                struct ilo_image_layout *layout)
 {
-   const struct pipe_resource *templ = params->templ;
-   const struct util_format_description *desc =
-      util_format_description(templ->format);
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (ilo_debug & ILO_DEBUG_NOHIZ)
-      return false;
-
-   /* we want 8x4 aligned levels */
-   if (templ->target == PIPE_TEXTURE_1D)
-      return false;
-
-   if (!(templ->bind & PIPE_BIND_DEPTH_STENCIL))
-      return false;
-
-   if (!util_format_has_depth(desc))
-      return false;
-
-   /* no point in having HiZ */
-   if (templ->usage == PIPE_USAGE_STAGING)
-      return false;
-
-   /*
-    * As can be seen in img_calculate_hiz_size(), HiZ may not be enabled
-    * for every level.  This is generally fine except on GEN6, where HiZ and
-    * separate stencil are enabled and disabled at the same time.  When the
-    * format is PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, enabling and disabling HiZ
-    * can result in incompatible formats.
-    */
-   if (ilo_dev_gen(params->dev) == ILO_GEN(6) &&
-       templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
-       templ->last_level)
-      return false;
+   /* we can define our own layout to save space */
+   layout->walk = ILO_IMAGE_WALK_LOD;
+   layout->interleaved_samples = false;
+   layout->valid_tilings = IMAGE_TILING_NONE;
+   layout->tiling = GEN6_TILING_NONE;
+   layout->aux = ILO_IMAGE_AUX_NONE;
+   layout->align_i = info->block_width;
+   layout->align_j = info->block_height;
 
    return true;
 }
 
 static void
-img_init_aux(struct ilo_image *img,
-             const struct ilo_image_params *params)
+image_get_gen6_slice_size(const struct ilo_dev *dev,
+                          const struct ilo_image_info *info,
+                          const struct ilo_image_layout *layout,
+                          uint8_t level,
+                          int *width, int *height)
 {
-   if (img_want_hiz(img, params))
-      img->aux.type = ILO_IMAGE_AUX_HIZ;
-   else if (img_want_mcs(img, params))
-      img->aux.type = ILO_IMAGE_AUX_MCS;
+   int w, h;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   w = u_minify(info->width, level);
+   h = u_minify(info->height, level);
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 1, page 114:
+    *
+    *     "The dimensions of the mip maps are first determined by applying the
+    *      sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then,
+    *      if necessary, they are padded out to compression block boundaries."
+    */
+   w = align(w, info->block_width);
+   h = align(h, info->block_height);
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 1, page 111:
+    *
+    *     "If the surface is multisampled (4x), these values must be adjusted
+    *      as follows before proceeding:
+    *
+    *        W_L = ceiling(W_L / 2) * 4
+    *        H_L = ceiling(H_L / 2) * 4"
+    *
+    * From the Ivy Bridge PRM, volume 1 part 1, page 108:
+    *
+    *     "If the surface is multisampled and it is a depth or stencil surface
+    *      or Multisampled Surface StorageFormat in SURFACE_STATE is
+    *      MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before
+    *      proceeding:
+    *
+    *        #samples  W_L =                    H_L =
+    *        2         ceiling(W_L / 2) * 4     HL [no adjustment]
+    *        4         ceiling(W_L / 2) * 4     ceiling(H_L / 2) * 4
+    *        8         ceiling(W_L / 2) * 8     ceiling(H_L / 2) * 4
+    *        16        ceiling(W_L / 2) * 8     ceiling(H_L / 2) * 8"
+    *
+    * For interleaved samples (4x), where pixels
+    *
+    *   (x, y  ) (x+1, y  )
+    *   (x, y+1) (x+1, y+1)
+    *
+    * would be occupied by
+    *
+    *   (x, y  , si0) (x+1, y  , si0) (x, y  , si1) (x+1, y  , si1)
+    *   (x, y+1, si0) (x+1, y+1, si0) (x, y+1, si1) (x+1, y+1, si1)
+    *   (x, y  , si2) (x+1, y  , si2) (x, y  , si3) (x+1, y  , si3)
+    *   (x, y+1, si2) (x+1, y+1, si2) (x, y+1, si3) (x+1, y+1, si3)
+    *
+    * Thus the need to
+    *
+    *   w = align(w, 2) * 2;
+    *   h = align(h, 2) * 2;
+    */
+   if (layout->interleaved_samples) {
+      switch (info->sample_count) {
+      case 1:
+         break;
+      case 2:
+         w = align(w, 2) * 2;
+         break;
+      case 4:
+         w = align(w, 2) * 2;
+         h = align(h, 2) * 2;
+         break;
+      case 8:
+         w = align(w, 2) * 4;
+         h = align(h, 2) * 2;
+         break;
+      case 16:
+         w = align(w, 2) * 4;
+         h = align(h, 2) * 4;
+         break;
+      default:
+         assert(!"unsupported sample count");
+         break;
+      }
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 1 part 1, page 108:
+    *
+    *     "For separate stencil buffer, the width must be mutiplied by 2 and
+    *      height divided by 2..."
+    *
+    * To make things easier (for transfer), we will just double the stencil
+    * stride in 3DSTATE_STENCIL_BUFFER.
+    */
+   w = align(w, layout->align_i);
+   h = align(h, layout->align_j);
+
+   *width = w;
+   *height = h;
+}
+
+static int
+image_get_gen6_layer_count(const struct ilo_dev *dev,
+                           const struct ilo_image_info *info,
+                           const struct ilo_image_layout *layout)
+{
+   const int slices = info->array_size;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * With non-interleaved samples, each sample index gets a layer of its
+    * own, so every array slice contributes sample_count layers.
+    */
+   return (layout->interleaved_samples) ? slices : slices * info->sample_count;
 }
 
 static void
-img_align(struct ilo_image *img, struct ilo_image_params *params)
+image_get_gen6_walk_layer_heights(const struct ilo_dev *dev,
+                                  const struct ilo_image_info *info,
+                                  struct ilo_image_layout *layout)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /* set walk_layer_h0, walk_layer_h1, and walk_layer_height of layout */
+   layout->walk_layer_h0 = layout->lods[0].slice_height;
+
+   if (info->level_count > 1) {
+      layout->walk_layer_h1 = layout->lods[1].slice_height;
+   } else { /* no LOD 1: compute the height it would have */
+      int dummy; /* receives the slice width, which is not used */
+      image_get_gen6_slice_size(dev, info, layout, 1,
+            &dummy, &layout->walk_layer_h1);
+   }
+
+   if (image_get_gen6_layer_count(dev, info, layout) == 1) {
+      layout->walk_layer_height = 0; /* only one layer */
+      return;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 1 part 1, page 115:
+    *
+    *     "The following equation is used for surface formats other than
+    *      compressed textures:
+    *
+    *        QPitch = (h0 + h1 + 11j)"
+    *
+    *     "The equation for compressed textures (BC* and FXT1 surface formats)
+    *      follows:
+    *
+    *        QPitch = (h0 + h1 + 11j) / 4"
+    *
+    *     "[DevSNB] Errata: Sampler MSAA Qpitch will be 4 greater than the
+    *      value calculated in the equation above, for every other odd Surface
+    *      Height starting from 1 i.e. 1,5,9,13"
+    *
+    * From the Ivy Bridge PRM, volume 1 part 1, page 111-112:
+    *
+    *     "If Surface Array Spacing is set to ARYSPC_FULL (note that the depth
+    *      buffer and stencil buffer have an implied value of ARYSPC_FULL):
+    *
+    *        QPitch = (h0 + h1 + 12j)
+    *        QPitch = (h0 + h1 + 12j) / 4 (compressed)
+    *
+    *      (There are many typos or missing words here...)"
+    *
+    * To access the N-th slice, an offset of (Stride * QPitch * N) is added to
+    * the base address.  The PRM divides QPitch by 4 for compressed formats
+    * because the block height for those formats are 4, and it wants QPitch to
+    * mean the number of memory rows, as opposed to texel rows, between
+    * slices.  Since we use texel rows everywhere, we do not need to divide
+    * QPitch by 4.
+    */
+   layout->walk_layer_height = layout->walk_layer_h0 + layout->walk_layer_h1 +
+      ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 12 : 11) * layout->align_j;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(6) && info->sample_count > 1 &&
+       info->height % 4 == 1)
+      layout->walk_layer_height += 4; /* [DevSNB] MSAA errata above */
+}
+
+static void
+image_get_gen6_monolithic_size(const struct ilo_dev *dev,
+                               const struct ilo_image_info *info,
+                               struct ilo_image_layout *layout,
+                               int max_x, int max_y)
 {
-   const struct pipe_resource *templ = params->templ;
    int align_w = 1, align_h = 1, pad_h = 0;
 
+   ILO_DEV_ASSERT(dev, 6, 8);
+
    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 118:
     *
@@ -864,15 +807,15 @@ img_align(struct ilo_image *img, struct ilo_image_params *params)
     *      padding purposes. The value of 4 for j still applies for mip level
     *      alignment and QPitch calculation."
     */
-   if (templ->bind & PIPE_BIND_SAMPLER_VIEW) {
-      align_w = MAX2(align_w, img->align_i);
-      align_h = MAX2(align_h, img->align_j);
+   if (info->bind_surface_sampler) {
+      align_w = MAX2(align_w, layout->align_i);
+      align_h = MAX2(align_h, layout->align_j);
 
-      if (templ->target == PIPE_TEXTURE_CUBE)
+      if (info->type == GEN6_SURFTYPE_CUBE)
          pad_h += 2;
 
-      if (params->compressed)
-         align_h = MAX2(align_h, img->align_j * 2);
+      if (info->compressed)
+         align_h = MAX2(align_h, layout->align_j * 2);
    }
 
    /*
@@ -881,149 +824,288 @@ img_align(struct ilo_image *img, struct ilo_image_params *params)
     *     "If the surface contains an odd number of rows of data, a final row
     *      below the surface must be allocated."
     */
-   if (templ->bind & PIPE_BIND_RENDER_TARGET)
+   if (info->bind_surface_dp_render)
       align_h = MAX2(align_h, 2);
 
    /*
     * Depth Buffer Clear/Resolve works in 8x4 sample blocks.  Pad to allow HiZ
     * for unaligned non-mipmapped and non-array images.
     */
-   if (img->aux.type == ILO_IMAGE_AUX_HIZ &&
-       templ->last_level == 0 &&
-       templ->array_size == 1 &&
-       templ->depth0 == 1) {
+   if (layout->aux == ILO_IMAGE_AUX_HIZ &&
+       info->level_count == 1 && info->array_size == 1 && info->depth == 1) {
       align_w = MAX2(align_w, 8);
       align_h = MAX2(align_h, 4);
    }
 
-   params->max_x = align(params->max_x, align_w);
-   params->max_y = align(params->max_y + pad_h, align_h);
+   layout->monolithic_width = align(max_x, align_w);
+   layout->monolithic_height = align(max_y + pad_h, align_h);
 }
 
-/* note that this may force the texture to be linear */
 static void
-img_calculate_bo_size(struct ilo_image *img,
-                      const struct ilo_image_params *params)
+image_get_gen6_lods(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info,
+                    struct ilo_image_layout *layout)
 {
-   assert(params->max_x % img->block_width == 0);
-   assert(params->max_y % img->block_height == 0);
-   assert(img->walk_layer_height % img->block_height == 0);
+   const int layer_count = image_get_gen6_layer_count(dev, info, layout);
+   int cur_x, cur_y, max_x, max_y;
+   uint8_t lv;
 
-   img->bo_stride =
-      (params->max_x / img->block_width) * img->block_size;
-   img->bo_height = params->max_y / img->block_height;
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-   while (true) {
-      unsigned w = img->bo_stride, h = img->bo_height;
-      unsigned align_w, align_h;
+   cur_x = 0;
+   cur_y = 0;
+   max_x = 0;
+   max_y = 0;
+   for (lv = 0; lv < info->level_count; lv++) {
+      int slice_w, slice_h, lod_w, lod_h;
 
-      /*
-       * From the Haswell PRM, volume 5, page 163:
-       *
-       *     "For linear surfaces, additional padding of 64 bytes is required
-       *      at the bottom of the surface. This is in addition to the padding
-       *      required above."
-       */
-      if (ilo_dev_gen(params->dev) >= ILO_GEN(7.5) &&
-          (params->templ->bind & PIPE_BIND_SAMPLER_VIEW) &&
-          img->tiling == GEN6_TILING_NONE)
-         h += (64 + img->bo_stride - 1) / img->bo_stride;
+      image_get_gen6_slice_size(dev, info, layout, lv, &slice_w, &slice_h);
 
-      /*
-       * From the Sandy Bridge PRM, volume 4 part 1, page 81:
-       *
-       *     "- For linear render target surfaces, the pitch must be a
-       *        multiple of the element size for non-YUV surface formats.
-       *        Pitch must be a multiple of 2 * element size for YUV surface
-       *        formats.
-       *      - For other linear surfaces, the pitch can be any multiple of
-       *        bytes.
-       *      - For tiled surfaces, the pitch must be a multiple of the tile
-       *        width."
-       *
-       * Different requirements may exist when the bo is used in different
-       * places, but our alignments here should be good enough that we do not
-       * need to check params->templ->bind.
-       */
-      switch (img->tiling) {
-      case GEN6_TILING_X:
-         align_w = 512;
-         align_h = 8;
+      layout->lods[lv].x = cur_x;
+      layout->lods[lv].y = cur_y;
+      layout->lods[lv].slice_width = slice_w;
+      layout->lods[lv].slice_height = slice_h;
+
+      switch (layout->walk) {
+      case ILO_IMAGE_WALK_LAYER:
+         lod_w = slice_w;
+         lod_h = slice_h;
+
+         /* MIPLAYOUT_BELOW */
+         if (lv == 1)
+            cur_x += lod_w;
+         else
+            cur_y += lod_h;
          break;
-      case GEN6_TILING_Y:
-         align_w = 128;
-         align_h = 32;
+      case ILO_IMAGE_WALK_LOD:
+         lod_w = slice_w;
+         lod_h = slice_h * layer_count;
+
+         if (lv == 1)
+            cur_x += lod_w;
+         else
+            cur_y += lod_h;
+
+         /* every LOD begins at tile boundaries */
+         if (info->level_count > 1) {
+            assert(info->format == GEN6_FORMAT_R8_UINT);
+            cur_x = align(cur_x, 64);
+            cur_y = align(cur_y, 64);
+         }
          break;
-      case GEN8_TILING_W:
-         /*
-          * From the Sandy Bridge PRM, volume 1 part 2, page 22:
-          *
-          *     "A 4KB tile is subdivided into 8-high by 8-wide array of
-          *      Blocks for W-Major Tiles (W Tiles). Each Block is 8 rows by 8
-          *      bytes."
-          */
-         align_w = 64;
-         align_h = 64;
+      case ILO_IMAGE_WALK_3D:
+         {
+            const int slice_count = u_minify(info->depth, lv);
+            const int slice_count_per_row = 1 << lv;
+            const int row_count =
+               (slice_count + slice_count_per_row - 1) / slice_count_per_row;
+
+            lod_w = slice_w * slice_count_per_row;
+            lod_h = slice_h * row_count;
+         }
+
+         cur_y += lod_h;
          break;
       default:
-         assert(img->tiling == GEN6_TILING_NONE);
-         /* some good enough values */
-         align_w = 64;
-         align_h = 2;
+         assert(!"unknown walk type");
+         lod_w = 0;
+         lod_h = 0;
          break;
       }
 
-      w = align(w, align_w);
-      h = align(h, align_h);
-
-      /* make sure the bo is mappable */
-      if (img->tiling != GEN6_TILING_NONE) {
-         /*
-          * Usually only the first 256MB of the GTT is mappable.
-          *
-          * See also how intel_context::max_gtt_map_object_size is calculated.
-          */
-         const size_t mappable_gtt_size = 256 * 1024 * 1024;
-
-         /*
-          * Be conservative.  We may be able to switch from VALIGN_4 to
-          * VALIGN_2 if the image was Y-tiled, but let's keep it simple.
-          */
-         if (mappable_gtt_size / w / 4 < h) {
-            if (params->valid_tilings & IMAGE_TILING_NONE) {
-               img->tiling = GEN6_TILING_NONE;
-               /* MCS support for non-MSRTs is limited to tiled RTs */
-               if (img->aux.type == ILO_IMAGE_AUX_MCS &&
-                   params->templ->nr_samples <= 1)
-                  img->aux.type = ILO_IMAGE_AUX_NONE;
-
-               continue;
-            } else {
-               ilo_warn("cannot force texture to be linear\n");
-            }
-         }
-      }
-
-      img->bo_stride = w;
-      img->bo_height = h;
-      break;
+      if (max_x < layout->lods[lv].x + lod_w)
+         max_x = layout->lods[lv].x + lod_w;
+      if (max_y < layout->lods[lv].y + lod_h)
+         max_y = layout->lods[lv].y + lod_h;
    }
+
+   if (layout->walk == ILO_IMAGE_WALK_LAYER) {
+      image_get_gen6_walk_layer_heights(dev, info, layout);
+      if (layer_count > 1)
+         max_y += layout->walk_layer_height * (layer_count - 1);
+   } else {
+      layout->walk_layer_h0 = 0;
+      layout->walk_layer_h1 = 0;
+      layout->walk_layer_height = 0;
+   }
+
+   image_get_gen6_monolithic_size(dev, info, layout, max_x, max_y);
 }
 
-static void
-img_calculate_hiz_size(struct ilo_image *img,
-                       const struct ilo_image_params *params)
+static bool
+image_bind_gpu(const struct ilo_image_info *info)
 {
-   const struct pipe_resource *templ = params->templ;
-   const unsigned hz_align_j = 8;
+   return (info->bind_surface_sampler ||
+           info->bind_surface_dp_render ||
+           info->bind_surface_dp_typed ||
+           info->bind_zs ||
+           info->bind_scanout ||
+           info->bind_cursor);
+}
+
+static bool
+image_validate_gen6(const struct ilo_dev *dev,
+                    const struct ilo_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 314:
+    *
+    *     "The separate stencil buffer is always enabled, thus the field in
+    *      3DSTATE_DEPTH_BUFFER to explicitly enable the separate stencil
+    *      buffer has been removed Surface formats with interleaved depth and
+    *      stencil are no longer supported"
+    */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7) && info->bind_zs)
+      assert(!info->interleaved_stencil);
+
+   return true;
+}
+
+static bool
+image_get_gen6_layout(const struct ilo_dev *dev,
+                      const struct ilo_image_info *info,
+                      struct ilo_image_layout *layout)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!image_validate_gen6(dev, info))
+      return false;
+
+   if (image_bind_gpu(info) || info->level_count > 1) {
+      if (!image_init_gen6_hardware_layout(dev, info, layout))
+         return false;
+   } else {
+      if (!image_init_gen6_transfer_layout(dev, info, layout))
+         return false;
+   }
+
+   /*
+    * the fact that align i and j are multiples of block width and height
+    * respectively is what makes the size of the bo a multiple of the block
+    * size, slices start at block boundaries, and many of the computations
+    * work.
+    */
+   assert(layout->align_i % info->block_width == 0);
+   assert(layout->align_j % info->block_height == 0);
+
+   /* make sure align() works */
+   assert(util_is_power_of_two(layout->align_i) &&
+          util_is_power_of_two(layout->align_j));
+   assert(util_is_power_of_two(info->block_width) &&
+          util_is_power_of_two(info->block_height));
+
+   image_get_gen6_lods(dev, info, layout);
+
+   assert(layout->walk_layer_height % info->block_height == 0);
+   assert(layout->monolithic_width % info->block_width == 0);
+   assert(layout->monolithic_height % info->block_height == 0);
+
+   return true;
+}
+
+static bool
+image_set_gen6_bo_size(struct ilo_image *img,
+                       const struct ilo_dev *dev,
+                       const struct ilo_image_info *info,
+                       const struct ilo_image_layout *layout)
+{
+   int stride, height;
+   int align_w, align_h;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   stride = (layout->monolithic_width / info->block_width) * info->block_size;
+   height = layout->monolithic_height / info->block_height;
+
+   /*
+    * From the Haswell PRM, volume 5, page 163:
+    *
+    *     "For linear surfaces, additional padding of 64 bytes is required
+    *      at the bottom of the surface. This is in addition to the padding
+    *      required above."
+    */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && info->bind_surface_sampler &&
+       layout->tiling == GEN6_TILING_NONE)
+      height += (64 + stride - 1) / stride;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 81:
+    *
+    *     "- For linear render target surfaces, the pitch must be a multiple
+    *        of the element size for non-YUV surface formats.  Pitch must be a
+    *        multiple of 2 * element size for YUV surface formats.
+    *
+    *      - For other linear surfaces, the pitch can be any multiple of
+    *        bytes.
+    *      - For tiled surfaces, the pitch must be a multiple of the tile
+    *        width."
+    *
+    * Different requirements may exist when the image is used in different
+    * places, but our alignments here should be good enough that we do not
+    * need to check info->bind_x.
+    */
+   switch (layout->tiling) {
+   case GEN6_TILING_X:
+      align_w = 512;
+      align_h = 8;
+      break;
+   case GEN6_TILING_Y:
+      align_w = 128;
+      align_h = 32;
+      break;
+   case GEN8_TILING_W:
+      /*
+       * From the Sandy Bridge PRM, volume 1 part 2, page 22:
+       *
+       *     "A 4KB tile is subdivided into 8-high by 8-wide array of
+       *      Blocks for W-Major Tiles (W Tiles). Each Block is 8 rows by 8
+       *      bytes."
+       */
+      align_w = 64;
+      align_h = 64;
+      break;
+   default:
+      assert(layout->tiling == GEN6_TILING_NONE);
+      /* some good enough values */
+      align_w = 64;
+      align_h = 2;
+      break;
+   }
+
+   if (info->force_bo_stride) {
+      if (info->force_bo_stride % align_w || info->force_bo_stride < stride)
+         return false;
+
+      img->bo_stride = info->force_bo_stride;
+   } else {
+      img->bo_stride = align(stride, align_w);
+   }
+
+   img->bo_height = align(height, align_h);
+
+   return true;
+}
+
+static bool
+image_set_gen6_hiz(struct ilo_image *img,
+                   const struct ilo_dev *dev,
+                   const struct ilo_image_info *info,
+                   const struct ilo_image_layout *layout)
+{
+   const int hz_align_j = 8;
    enum ilo_image_walk_type hz_walk;
-   unsigned hz_width, hz_height, lv;
-   unsigned hz_clear_w, hz_clear_h;
+   int hz_width, hz_height;
+   int hz_clear_w, hz_clear_h;
+   uint8_t lv;
 
-   assert(img->aux.type == ILO_IMAGE_AUX_HIZ);
+   ILO_DEV_ASSERT(dev, 6, 8);
 
-   assert(img->walk == ILO_IMAGE_WALK_LAYER ||
-          img->walk == ILO_IMAGE_WALK_3D);
+   assert(layout->aux == ILO_IMAGE_AUX_HIZ);
+
+   assert(layout->walk == ILO_IMAGE_WALK_LAYER ||
+          layout->walk == ILO_IMAGE_WALK_3D);
 
    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 312:
@@ -1036,8 +1118,8 @@ img_calculate_hiz_size(struct ilo_image *img,
     *
     * We will put all LODs in a single bo with ILO_IMAGE_WALK_LOD.
     */
-   if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
-      hz_walk = img->walk;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      hz_walk = layout->walk;
    else
       hz_walk = ILO_IMAGE_WALK_LOD;
 
@@ -1051,16 +1133,16 @@ img_calculate_hiz_size(struct ilo_image *img,
    switch (hz_walk) {
    case ILO_IMAGE_WALK_LAYER:
       {
-         const unsigned h0 = align(params->h0, hz_align_j);
-         const unsigned h1 = align(params->h1, hz_align_j);
-         const unsigned htail =
-            ((ilo_dev_gen(params->dev) >= ILO_GEN(7)) ? 12 : 11) * hz_align_j;
-         const unsigned hz_qpitch = h0 + h1 + htail;
+         const int h0 = align(layout->walk_layer_h0, hz_align_j);
+         const int h1 = align(layout->walk_layer_h1, hz_align_j);
+         const int htail =
+            ((ilo_dev_gen(dev) >= ILO_GEN(7)) ? 12 : 11) * hz_align_j;
+         const int hz_qpitch = h0 + h1 + htail;
 
-         hz_width = align(img->lods[0].slice_width, 16);
+         hz_width = align(layout->lods[0].slice_width, 16);
 
-         hz_height = hz_qpitch * templ->array_size / 2;
-         if (ilo_dev_gen(params->dev) >= ILO_GEN(7))
+         hz_height = hz_qpitch * info->array_size / 2;
+         if (ilo_dev_gen(dev) >= ILO_GEN(7))
             hz_height = align(hz_height, 8);
 
          img->aux.walk_layer_height = hz_qpitch;
@@ -1068,27 +1150,27 @@ img_calculate_hiz_size(struct ilo_image *img,
       break;
    case ILO_IMAGE_WALK_LOD:
       {
-         unsigned lod_tx[PIPE_MAX_TEXTURE_LEVELS];
-         unsigned lod_ty[PIPE_MAX_TEXTURE_LEVELS];
-         unsigned cur_tx, cur_ty;
+         int lod_tx[ILO_IMAGE_MAX_LEVEL_COUNT];
+         int lod_ty[ILO_IMAGE_MAX_LEVEL_COUNT];
+         int cur_tx, cur_ty;
 
          /* figure out the tile offsets of LODs */
          hz_width = 0;
          hz_height = 0;
          cur_tx = 0;
          cur_ty = 0;
-         for (lv = 0; lv <= templ->last_level; lv++) {
-            unsigned tw, th;
+         for (lv = 0; lv < info->level_count; lv++) {
+            int tw, th;
 
             lod_tx[lv] = cur_tx;
             lod_ty[lv] = cur_ty;
 
-            tw = align(img->lods[lv].slice_width, 16);
-            th = align(img->lods[lv].slice_height, hz_align_j) *
-               templ->array_size / 2;
+            tw = align(layout->lods[lv].slice_width, 16);
+            th = align(layout->lods[lv].slice_height, hz_align_j) *
+               info->array_size / 2;
             /* convert to Y-tiles */
-            tw = align(tw, 128) / 128;
-            th = align(th, 32) / 32;
+            tw = (tw + 127) / 128;
+            th = (th + 31) / 32;
 
             if (hz_width < cur_tx + tw)
                hz_width = cur_tx + tw;
@@ -1102,22 +1184,23 @@ img_calculate_hiz_size(struct ilo_image *img,
          }
 
          /* convert tile offsets to memory offsets */
-         for (lv = 0; lv <= templ->last_level; lv++) {
+         for (lv = 0; lv < info->level_count; lv++) {
             img->aux.walk_lod_offsets[lv] =
                (lod_ty[lv] * hz_width + lod_tx[lv]) * 4096;
          }
+
          hz_width *= 128;
          hz_height *= 32;
       }
       break;
    case ILO_IMAGE_WALK_3D:
-      hz_width = align(img->lods[0].slice_width, 16);
+      hz_width = align(layout->lods[0].slice_width, 16);
 
       hz_height = 0;
-      for (lv = 0; lv <= templ->last_level; lv++) {
-         const unsigned h = align(img->lods[lv].slice_height, hz_align_j);
+      for (lv = 0; lv < info->level_count; lv++) {
+         const int h = align(layout->lods[lv].slice_height, hz_align_j);
          /* according to the formula, slices are packed together vertically */
-         hz_height += h * u_minify(templ->depth0, lv);
+         hz_height += h * u_minify(info->depth, lv);
       }
       hz_height /= 2;
       break;
@@ -1136,8 +1219,7 @@ img_calculate_hiz_size(struct ilo_image *img,
     */
    hz_clear_w = 8;
    hz_clear_h = 4;
-   switch (templ->nr_samples) {
-   case 0:
+   switch (info->sample_count) {
    case 1:
    default:
       break;
@@ -1158,33 +1240,38 @@ img_calculate_hiz_size(struct ilo_image *img,
       break;
    }
 
-   for (lv = 0; lv <= templ->last_level; lv++) {
-      if (u_minify(img->width0, lv) % hz_clear_w ||
-          u_minify(img->height0, lv) % hz_clear_h)
+   for (lv = 0; lv < info->level_count; lv++) {
+      if (u_minify(info->width, lv) % hz_clear_w ||
+          u_minify(info->height, lv) % hz_clear_h)
          break;
       img->aux.enables |= 1 << lv;
    }
 
-   /* we padded to allow this in img_align() */
-   if (templ->last_level == 0 && templ->array_size == 1 && templ->depth0 == 1)
+   /* we padded to allow this in image_get_gen6_monolithic_size() */
+   if (info->level_count == 1 && info->array_size == 1 && info->depth == 1)
       img->aux.enables |= 0x1;
 
    /* align to Y-tile */
    img->aux.bo_stride = align(hz_width, 128);
    img->aux.bo_height = align(hz_height, 32);
+
+   return true;
 }
 
-static void
-img_calculate_mcs_size(struct ilo_image *img,
-                       const struct ilo_image_params *params)
+static bool
+image_set_gen7_mcs(struct ilo_image *img,
+                   const struct ilo_dev *dev,
+                   const struct ilo_image_info *info,
+                   const struct ilo_image_layout *layout)
 {
-   const struct pipe_resource *templ = params->templ;
    int mcs_width, mcs_height, mcs_cpp;
    int downscale_x, downscale_y;
 
-   assert(img->aux.type == ILO_IMAGE_AUX_MCS);
+   ILO_DEV_ASSERT(dev, 7, 8);
 
-   if (templ->nr_samples > 1) {
+   assert(layout->aux == ILO_IMAGE_AUX_MCS);
+
+   if (info->sample_count > 1) {
       /*
        * From the Ivy Bridge PRM, volume 2 part 1, page 326, the clear
        * rectangle is scaled down by 8x2 for 4X MSAA and 2x2 for 8X MSAA.  The
@@ -1198,7 +1285,7 @@ img_calculate_mcs_size(struct ilo_image *img,
        * RT.  Similarly, we could reason that an OWord in 4X MCS maps to a 8x2
        * pixel block in the RT.
        */
-      switch (templ->nr_samples) {
+      switch (info->sample_count) {
       case 2:
       case 4:
          downscale_x = 8;
@@ -1217,7 +1304,7 @@ img_calculate_mcs_size(struct ilo_image *img,
          break;
       default:
          assert(!"unsupported sample count");
-         return;
+         return false;
          break;
       }
 
@@ -1226,8 +1313,8 @@ img_calculate_mcs_size(struct ilo_image *img,
        * clear rectangle cannot be masked.  The scale-down clear rectangle
        * thus must be aligned to 2x2, and we need to pad.
        */
-      mcs_width = align(img->width0, downscale_x * 2);
-      mcs_height = align(img->height0, downscale_y * 2);
+      mcs_width = align(info->width, downscale_x * 2);
+      mcs_height = align(info->height, downscale_y * 2);
    } else {
       /*
        * From the Ivy Bridge PRM, volume 2 part 1, page 327:
@@ -1262,18 +1349,18 @@ img_calculate_mcs_size(struct ilo_image *img,
        * anything except for the size of the allocated MCS.  Let's see if we
        * hit out-of-bound access.
        */
-      switch (img->tiling) {
+      switch (layout->tiling) {
       case GEN6_TILING_X:
-         downscale_x = 64 / img->block_size;
+         downscale_x = 64 / info->block_size;
          downscale_y = 2;
          break;
       case GEN6_TILING_Y:
-         downscale_x = 32 / img->block_size;
+         downscale_x = 32 / info->block_size;
          downscale_y = 4;
          break;
       default:
          assert(!"unsupported tiling mode");
-         return;
+         return false;
          break;
       }
 
@@ -1290,181 +1377,75 @@ img_calculate_mcs_size(struct ilo_image *img,
        * The scaled-down clear rectangle must be aligned to 4x4 instead of
        * 2x2, and we need to pad.
        */
-      mcs_width = align(img->width0, downscale_x * 4) / downscale_x;
-      mcs_height = align(img->height0, downscale_y * 4) / downscale_y;
+      mcs_width = align(info->width, downscale_x * 4) / downscale_x;
+      mcs_height = align(info->height, downscale_y * 4) / downscale_y;
       mcs_cpp = 16; /* an OWord */
    }
 
-   img->aux.enables = (1 << (templ->last_level + 1)) - 1;
+   img->aux.enables = (1 << info->level_count) - 1;
    /* align to Y-tile */
    img->aux.bo_stride = align(mcs_width * mcs_cpp, 128);
    img->aux.bo_height = align(mcs_height, 32);
+
+   return true;
 }
 
-static void
-img_init(struct ilo_image *img,
-         struct ilo_image_params *params)
+bool
+ilo_image_init(struct ilo_image *img,
+               const struct ilo_dev *dev,
+               const struct ilo_image_info *info)
 {
-   /* there are hard dependencies between every function here */
+   struct ilo_image_layout layout;
 
-   img_init_aux(img, params);
-   img_init_size_and_format(img, params);
-   img_init_walk(img, params);
-   img_init_tiling(img, params);
-   img_init_alignments(img, params);
-   img_init_lods(img, params);
-   img_init_layer_height(img, params);
+   assert(ilo_is_zeroed(img, sizeof(*img)));
 
-   img_align(img, params);
-   img_calculate_bo_size(img, params);
+   memset(&layout, 0, sizeof(layout));
+   layout.lods = img->lods;
 
-   img->scanout = (params->templ->bind & PIPE_BIND_SCANOUT);
+   if (!image_get_gen6_layout(dev, info, &layout))
+      return false;
 
-   switch (img->aux.type) {
+   img->type = info->type;
+
+   img->format = info->format;
+   img->block_width = info->block_width;
+   img->block_height = info->block_height;
+   img->block_size = info->block_size;
+
+   img->width0 = info->width;
+   img->height0 = info->height;
+   img->depth0 = info->depth;
+   img->array_size = info->array_size;
+   img->level_count = info->level_count;
+   img->sample_count = info->sample_count;
+
+   img->walk = layout.walk;
+   img->interleaved_samples = layout.interleaved_samples;
+
+   img->tiling = layout.tiling;
+
+   img->aux.type = layout.aux;
+
+   img->align_i = layout.align_i;
+   img->align_j = layout.align_j;
+
+   img->walk_layer_height = layout.walk_layer_height;
+
+   if (!image_set_gen6_bo_size(img, dev, info, &layout))
+      return false;
+
+   img->scanout = info->bind_scanout;
+
+   switch (layout.aux) {
    case ILO_IMAGE_AUX_HIZ:
-      img_calculate_hiz_size(img, params);
+      image_set_gen6_hiz(img, dev, info, &layout);
       break;
    case ILO_IMAGE_AUX_MCS:
-      img_calculate_mcs_size(img, params);
+      image_set_gen7_mcs(img, dev, info, &layout);
       break;
    default:
       break;
    }
-}
-
-/**
- * The texutre is for transfer only.  We can define our own layout to save
- * space.
- */
-static void
-img_init_for_transfer(struct ilo_image *img,
-                      const struct ilo_dev *dev,
-                      const struct pipe_resource *templ)
-{
-   const unsigned num_layers = (templ->target == PIPE_TEXTURE_3D) ?
-      templ->depth0 : templ->array_size;
-   unsigned layer_width, layer_height;
-
-   assert(templ->last_level == 0);
-   assert(templ->nr_samples <= 1);
-
-   img->aux.type = ILO_IMAGE_AUX_NONE;
-
-   img->target = templ->target;
-   img->width0 = templ->width0;
-   img->height0 = templ->height0;
-   img->depth0 = templ->depth0;
-   img->array_size = templ->array_size;
-   img->level_count = 1;
-   img->sample_count = 1;
-
-   img->format = templ->format;
-   img->block_width = util_format_get_blockwidth(templ->format);
-   img->block_height = util_format_get_blockheight(templ->format);
-   img->block_size = util_format_get_blocksize(templ->format);
-
-   img->walk = ILO_IMAGE_WALK_LOD;
-
-   img->tiling = GEN6_TILING_NONE;
-
-   img->align_i = img->block_width;
-   img->align_j = img->block_height;
-
-   assert(util_is_power_of_two(img->block_width) &&
-          util_is_power_of_two(img->block_height));
-
-   /* use packed layout */
-   layer_width = align(templ->width0, img->align_i);
-   layer_height = align(templ->height0, img->align_j);
-
-   img->lods[0].slice_width = layer_width;
-   img->lods[0].slice_height = layer_height;
-
-   img->bo_stride = (layer_width / img->block_width) * img->block_size;
-   img->bo_stride = align(img->bo_stride, 64);
-
-   img->bo_height = (layer_height / img->block_height) * num_layers;
-}
-
-/**
- * Initialize the image.  Callers should zero-initialize \p img first.
- */
-void ilo_image_init(struct ilo_image *img,
-                    const struct ilo_dev *dev,
-                    const struct pipe_resource *templ)
-{
-   struct ilo_image_params params;
-   bool transfer_only;
-
-   assert(ilo_is_zeroed(img, sizeof(*img)));
-
-   /* use transfer layout when the texture is never bound to GPU */
-   transfer_only = !(templ->bind & ~(PIPE_BIND_TRANSFER_WRITE |
-                                     PIPE_BIND_TRANSFER_READ));
-   if (transfer_only && templ->last_level == 0 && templ->nr_samples <= 1) {
-      img_init_for_transfer(img, dev, templ);
-      return;
-   }
-
-   memset(&params, 0, sizeof(params));
-   params.dev = dev;
-   params.templ = templ;
-   params.valid_tilings = IMAGE_TILING_ALL;
-
-   img_init(img, &params);
-}
-
-bool
-ilo_image_init_for_imported(struct ilo_image *img,
-                            const struct ilo_dev *dev,
-                            const struct pipe_resource *templ,
-                            enum gen_surface_tiling tiling,
-                            unsigned bo_stride)
-{
-   struct ilo_image_params params;
-
-   assert(ilo_is_zeroed(img, sizeof(*img)));
-
-   if ((tiling == GEN6_TILING_X && bo_stride % 512) ||
-       (tiling == GEN6_TILING_Y && bo_stride % 128) ||
-       (tiling == GEN8_TILING_W && bo_stride % 64))
-      return false;
-
-   memset(&params, 0, sizeof(params));
-   params.dev = dev;
-   params.templ = templ;
-   params.valid_tilings = 1 << tiling;
-
-   img_init(img, &params);
-
-   assert(img->tiling == tiling);
-   if (img->bo_stride > bo_stride)
-      return false;
-
-   img->bo_stride = bo_stride;
-
-   /* assume imported RTs are also scanouts */
-   if (!img->scanout)
-      img->scanout = (templ->bind & PIPE_BIND_RENDER_TARGET);
-
-   return true;
-}
-
-bool
-ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev)
-{
-   /* HiZ is required for separate stencil on Gen6 */
-   if (ilo_dev_gen(dev) == ILO_GEN(6) &&
-       img->aux.type == ILO_IMAGE_AUX_HIZ &&
-       img->separate_stencil)
-      return false;
-
-   /* MCS is required for multisample images */
-   if (img->aux.type == ILO_IMAGE_AUX_MCS &&
-       img->sample_count > 1)
-      return false;
-
-   img->aux.enables = 0x0;
 
    return true;
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index af15e856028..646ed6f5727 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -29,11 +29,17 @@
 #define ILO_IMAGE_H
 
 #include "genhw/genhw.h"
-#include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
 
+/*
+ * From the Ivy Bridge PRM, volume 4 part 1, page 75:
+ *
+ *     "(MIP Count / LOD) representing [1,15] MIP levels"
+ */
+#define ILO_IMAGE_MAX_LEVEL_COUNT 15
+
 enum ilo_image_aux_type {
    ILO_IMAGE_AUX_NONE,
    ILO_IMAGE_AUX_HIZ,
@@ -68,6 +74,49 @@ enum ilo_image_walk_type {
    ILO_IMAGE_WALK_3D,
 };
 
+struct ilo_image_info {
+   enum gen_surface_type type;
+
+   enum gen_surface_format format;
+   bool interleaved_stencil;
+   bool is_integer;
+   /* width, height and size of pixel blocks */
+   bool compressed;
+   unsigned block_width;
+   unsigned block_height;
+   unsigned block_size;
+
+   /* image size */
+   uint16_t width;
+   uint16_t height;
+   uint16_t depth;
+   uint16_t array_size;
+   uint8_t level_count;
+   uint8_t sample_count;
+
+   /* disable optional aux */
+   bool aux_disable;
+
+   /* tilings to consider, if any bit is set */
+   uint8_t valid_tilings;
+
+   /*
+    * prefer GEN6_TILING_NONE when the (estimated) image size exceeds the
+    * threshold
+    */
+   uint32_t prefer_linear_threshold;
+
+   /* force a stride when non-zero */
+   uint32_t force_bo_stride;
+
+   bool bind_surface_sampler;
+   bool bind_surface_dp_render;
+   bool bind_surface_dp_typed;
+   bool bind_zs;
+   bool bind_scanout;
+   bool bind_cursor;
+};
+
 /*
  * When the walk type is ILO_IMAGE_WALK_LAYER, there is only a slice in each
  * LOD and this is used to describe LODs in the first array layer.  Otherwise,
@@ -88,7 +137,10 @@ struct ilo_image_lod {
  * Texture layout.
  */
 struct ilo_image {
-   enum pipe_texture_target target;
+   enum gen_surface_type type;
+
+   enum gen_surface_format format;
+   bool interleaved_stencil;
 
    /* size, format, etc for programming hardware states */
    unsigned width0;
@@ -97,8 +149,6 @@ struct ilo_image {
    unsigned array_size;
    unsigned level_count;
    unsigned sample_count;
-   enum pipe_format format;
-   bool separate_stencil;
 
    /*
     * width, height, and size of pixel blocks for conversion between pixel
@@ -117,7 +167,7 @@ struct ilo_image {
    unsigned align_i;
    unsigned align_j;
 
-   struct ilo_image_lod lods[PIPE_MAX_TEXTURE_LEVELS];
+   struct ilo_image_lod lods[ILO_IMAGE_MAX_LEVEL_COUNT];
 
    /* physical layer height for ILO_IMAGE_WALK_LAYER */
    unsigned walk_layer_height;
@@ -136,36 +186,18 @@ struct ilo_image {
       unsigned enables;
 
       /* LOD offsets for ILO_IMAGE_WALK_LOD */
-      unsigned walk_lod_offsets[PIPE_MAX_TEXTURE_LEVELS];
+      unsigned walk_lod_offsets[ILO_IMAGE_MAX_LEVEL_COUNT];
 
       unsigned walk_layer_height;
       unsigned bo_stride;
       unsigned bo_height;
-
-      /* managed by users */
-      struct intel_bo *bo;
    } aux;
-
-   /* managed by users */
-   struct intel_bo *bo;
 };
 
-struct pipe_resource;
-
-void
+bool
 ilo_image_init(struct ilo_image *img,
                const struct ilo_dev *dev,
-               const struct pipe_resource *templ);
-
-bool
-ilo_image_init_for_imported(struct ilo_image *img,
-                            const struct ilo_dev *dev,
-                            const struct pipe_resource *templ,
-                            enum gen_surface_tiling tiling,
-                            unsigned bo_stride);
-
-bool
-ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev);
+               const struct ilo_image_info *info);
 
 static inline bool
 ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level)
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c
index 38c0b719ab3..6ef2c91a592 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_sol.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c
@@ -26,7 +26,7 @@
  */
 
 #include "ilo_debug.h"
-#include "ilo_buffer.h"
+#include "ilo_vma.h"
 #include "ilo_state_sol.h"
 
 static bool
@@ -270,9 +270,6 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev,
 {
    ILO_DEV_ASSERT(dev, 7, 8);
 
-   if (info->buf)
-      assert(info->offset < info->buf->bo_size && info->size);
-
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 208:
     *
@@ -281,9 +278,17 @@ sol_buffer_validate_gen7(const struct ilo_dev *dev,
     */
    assert(info->offset % 4 == 0);
 
+   if (info->vma) {
+      assert(info->vma->vm_alignment % 4 == 0);
+      assert(info->size && info->offset + info->size <= info->vma->vm_size);
+   }
+
    /* Gen8+ only */
-   if (info->write_offset_load || info->write_offset_save)
-      assert(ilo_dev_gen(dev) >= ILO_GEN(8));
+   if (info->write_offset_load || info->write_offset_save) {
+      assert(ilo_dev_gen(dev) >= ILO_GEN(8) && info->write_offset_vma);
+      assert(info->write_offset_offset + sizeof(uint32_t) <=
+            info->write_offset_vma->vm_size);
+   }
 
    /*
     * From the Broadwell PRM, volume 2b, page 206:
@@ -304,25 +309,15 @@ static uint32_t
 sol_buffer_get_gen6_size(const struct ilo_dev *dev,
                          const struct ilo_state_sol_buffer_info *info)
 {
-   uint32_t size;
-
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (!info->buf)
-      return 0;
-
-   size = (info->offset + info->size <= info->buf->bo_size) ? info->size :
-      info->buf->bo_size - info->offset;
-
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 208:
     *
     *     "(Surface End Address) This field specifies the ending DWord
     *      address..."
     */
-   size &= ~3;
-
-   return size;
+   return (info->vma) ? info->size & ~3 : 0;
 }
 
 static bool
@@ -359,7 +354,7 @@ sol_buffer_set_gen8_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
 
    dw1 = 0;
 
-   if (info->buf)
+   if (info->vma)
       dw1 |= GEN8_SO_BUF_DW1_ENABLE;
    if (info->write_offset_load)
       dw1 |= GEN8_SO_BUF_DW1_OFFSET_WRITE_ENABLE;
@@ -429,6 +424,15 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
    return ilo_state_sol_init(sol, dev, &info);
 }
 
+uint32_t
+ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                          uint32_t *alignment)
+{
+   /* report DWord (4-byte) alignment; size is returned as is, unpadded */
+   *alignment = 4;
+   return size;
+}
+
 bool
 ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
                           const struct ilo_dev *dev,
@@ -443,9 +447,8 @@ ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
    else
       ret &= sol_buffer_set_gen7_3dstate_so_buffer(sb, dev, info);
 
-   sb->need_bo = (info->size > 0);
-   sb->need_write_offset_bo = (info->write_offset_save ||
-         (info->write_offset_load && !info->write_offset_imm_enable));
+   sb->vma = info->vma;
+   sb->write_offset_vma = info->write_offset_vma;
 
    assert(ret);
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h
index 2513fcb4979..92c5f94725b 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_sol.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h
@@ -107,17 +107,17 @@ struct ilo_state_sol {
    uint8_t decl_count;
 };
 
-struct ilo_buffer;
+struct ilo_vma;
 
 struct ilo_state_sol_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
    uint32_t offset;
    uint32_t size;
 
-   /*
-    * Gen8+ only.  When enabled, require a write offset bo of at least
-    * (sizeof(uint32_t) * ILO_STATE_SOL_MAX_BUFFER_COUNT) bytes
-    */
+   /* Gen8+ only; at least sizeof(uint32_t) bytes */
+   const struct ilo_vma *write_offset_vma;
+   uint32_t write_offset_offset;
+
    bool write_offset_load;
    bool write_offset_save;
 
@@ -126,14 +126,10 @@ struct ilo_state_sol_buffer_info {
 };
 
 struct ilo_state_sol_buffer {
-   uint32_t so_buf[4];
+   uint32_t so_buf[5];
 
-   bool need_bo;
-   bool need_write_offset_bo;
-
-   /* managed by users */
-   struct intel_bo *bo;
-   struct intel_bo *write_offset_bo;
+   const struct ilo_vma *vma;
+   const struct ilo_vma *write_offset_vma;
 };
 
 static inline size_t
@@ -154,6 +150,10 @@ ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
                             const struct ilo_dev *dev,
                             bool render_disable);
 
+uint32_t
+ilo_state_sol_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                          uint32_t *alignment);
+
 bool
 ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
                           const struct ilo_dev *dev,
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
index 5be9f8f6270..40fe15f316f 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -26,8 +26,8 @@
  */
 
 #include "ilo_debug.h"
-#include "ilo_buffer.h"
 #include "ilo_image.h"
+#include "ilo_vma.h"
 #include "ilo_state_surface.h"
 
 static bool
@@ -94,17 +94,129 @@ surface_set_gen7_null_SURFACE_STATE(struct ilo_state_surface *surf,
    return true;
 }
 
+static uint32_t
+surface_get_gen6_buffer_offset_alignment(const struct ilo_dev *dev,
+                                         const struct ilo_state_surface_buffer_info *info)
+{
+   uint32_t alignment;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    *
+    *     "The Base Address for linear render target surfaces and surfaces
+    *      accessed with the typed surface read/write data port messages must
+    *      be element-size aligned, for non-YUV surface formats, or a multiple
+    *      of 2 element-sizes for YUV surface formats.  Other linear surfaces
+    *      have no alignment requirements (byte alignment is sufficient)."
+    *
+    *     "Certain message types used to access surfaces have more stringent
+    *      alignment requirements. Please refer to the specific message
+    *      documentation for additional restrictions."
+    */
+   switch (info->access) {
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+      /* no alignment requirements (byte alignment is sufficient) */
+      alignment = 1;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+      /* element-size aligned; the pitch must be a multiple of it as well */
+      alignment = info->format_size;
+
+      assert(info->struct_size % alignment == 0);
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED:
+      /*
+       * Nothing is said about Untyped* messages, but I think they require the
+       * base address to be DWord aligned.
+       */
+      alignment = 4;
+
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 70:
+       *
+       *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the
+       *      pitch must be a multiple of 4 bytes."
+       */
+      if (info->struct_size > 1)
+         assert(info->struct_size % alignment == 0);
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237:
+       *
+       *     "the surface base address must be OWord aligned"
+       *
+       * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord
+       * Dual Block Read/Write.
+       *
+       * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249:
+       *
+       *     "The surface base address must be DWord aligned"
+       *
+       * for DWord Scattered Read/Write and Byte Scattered Read/Write.
+       */
+      alignment = (info->format_size > 4) ? 16 : 4;
+
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, 237, and
+       * 246:
+       *
+       *     "the surface pitch is ignored, the surface is treated as a
+       *      1-dimensional surface. An element size (pitch) of 16 bytes is
+       *      used to determine the size of the buffer for out-of-bounds
+       *      checking if using the surface state model."
+       *
+       * for OWord Block Read/Write, Unaligned OWord Block Read, OWord
+       * Dual Block Read/Write, and DWord Scattered Read/Write.
+       *
+       * From the Ivy Bridge PRM, volume 4 part 1, page 248:
+       *
+       *     "The surface pitch is ignored, the surface is treated as a
+       *      1-dimensional surface. An element size (pitch) of 4 bytes is
+       *      used to determine the size of the buffer for out-of-bounds
+       *      checking if using the surface state model."
+       *
+       * for Byte Scattered Read/Write.
+       *
+       * The pitch is programmable on Gen7.5+; only older gens are checked.
+       */
+      if (ilo_dev_gen(dev) < ILO_GEN(7.5)) {
+         const int fixed = (info->format_size > 1) ? 16 : 4;
+         assert(info->struct_size == fixed);
+      }
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_SVB:
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 259:
+       *
+       *     "Both the surface base address and surface pitch must be DWord
+       *      aligned."
+       */
+      alignment = 4;
+
+      assert(info->struct_size % alignment == 0);
+      break;
+   default:
+      assert(!"unknown access");
+      alignment = 1;
+      break;
+   }
+
+   return alignment;
+}
+
 static bool
 surface_validate_gen6_buffer(const struct ilo_dev *dev,
                              const struct ilo_state_surface_buffer_info *info)
 {
+   uint32_t alignment;
+
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   /* SVB writes are Gen6-only */
-   if (ilo_dev_gen(dev) >= ILO_GEN(7))
-      assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB);
-
-   if (info->offset + info->size > info->buf->bo_size) {
+   if (info->offset + info->size > info->vma->vm_size) {
       ilo_warn("invalid buffer range\n");
       return false;
    }
@@ -120,87 +232,34 @@ surface_validate_gen6_buffer(const struct ilo_dev *dev,
       return false;
    }
 
+   alignment = surface_get_gen6_buffer_offset_alignment(dev, info);
+   if (info->offset % alignment || info->vma->vm_alignment % alignment) {
+      ilo_warn("bad buffer offset\n");
+      return false;
+   }
+
+   /* no STRBUF on Gen6 */
+   if (info->format == GEN6_FORMAT_RAW && info->struct_size > 1)
+      assert(ilo_dev_gen(dev) >= ILO_GEN(7));
+
+   /* SVB writes are Gen6 only */
+   if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB)
+      assert(ilo_dev_gen(dev) == ILO_GEN(6));
+
    /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    * From the Ivy Bridge PRM, volume 4 part 1, page 83:
     *
-    *     "The Base Address for linear render target surfaces and surfaces
-    *      accessed with the typed surface read/write data port messages must
-    *      be element-size aligned, for non-YUV surface formats, or a multiple
-    *      of 2 element-sizes for YUV surface formats.  Other linear surfaces
-    *      have no alignment requirements (byte alignment is sufficient)."
+    *     "NOTE: "RAW" is supported only with buffers and structured buffers
+    *      accessed via the untyped surface read/write and untyped atomic
+    *      operation messages, which do not have a column in the table."
     *
-    *     "Certain message types used to access surfaces have more stringent
-    *      alignment requirements. Please refer to the specific message
-    *      documentation for additional restrictions."
+    * From the Ivy Bridge PRM, volume 4 part 1, page 252:
     *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237:
-    *
-    *     "the surface base address must be OWord aligned"
-    *
-    * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord Dual
-    * Block Read/Write.
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249:
-    *
-    *     "The surface base address must be DWord aligned"
-    *
-    * for DWord Scattered Read/Write and Byte Scattered Read/Write.
-    *
-    * We have to rely on users to correctly set info->struct_size here.  DWord
-    * Scattered Read/Write has conflicting pitch and alignment, but we do not
-    * use them yet so we are fine.
-    *
-    * It is unclear if sampling engine surfaces require aligned offsets.
+    *     "For untyped messages, the Surface Format must be RAW and the
+    *      Surface Type must be SURFTYPE_BUFFER or SURFTYPE_STRBUF."
     */
-   if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) {
-      assert(info->struct_size % info->format_size == 0);
-
-      if (info->offset % info->struct_size) {
-         ilo_warn("bad buffer offset\n");
-         return false;
-      }
-   }
-
-   if (info->format == GEN6_FORMAT_RAW) {
-      /*
-       * From the Sandy Bridge PRM, volume 4 part 1, page 97:
-       *
-       *     ""RAW" is supported only with buffers and structured buffers
-       *      accessed via the untyped surface read/write and untyped atomic
-       *      operation messages, which do not have a column in the table."
-       *
-       * We do not have a specific access mode for untyped messages.
-       */
-      assert(info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED);
-
-      /*
-       * Nothing is said about Untyped* messages, but I guess they require the
-       * base address to be DWord aligned.
-       */
-      if (info->offset % 4) {
-         ilo_warn("bad RAW buffer offset\n");
-         return false;
-      }
-
-      if (info->struct_size > 1) {
-         /* no STRBUF on Gen6 */
-         if (ilo_dev_gen(dev) == ILO_GEN(6)) {
-            ilo_warn("no STRBUF support\n");
-            return false;
-         }
-
-         /*
-          * From the Ivy Bridge PRM, volume 4 part 1, page 70:
-          *
-          *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the
-          *      pitch must be a multiple of 4 bytes."
-          */
-         if (info->struct_size % 4) {
-            ilo_warn("bad STRBUF pitch\n");
-            return false;
-         }
-      }
-   }
+   assert((info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED) ==
+          (info->format == GEN6_FORMAT_RAW));
 
    return true;
 }
@@ -215,8 +274,7 @@ surface_get_gen6_buffer_struct_count(const struct ilo_dev *dev,
    ILO_DEV_ASSERT(dev, 6, 8);
 
    c = info->size / info->struct_size;
-   if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB &&
-       info->format_size < info->size - info->struct_size * c)
+   if (info->format_size < info->size - info->struct_size * c)
       c++;
 
    /*
@@ -367,29 +425,6 @@ surface_set_gen7_buffer_SURFACE_STATE(struct ilo_state_surface *surf,
    return true;
 }
 
-static enum gen_surface_type
-get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   switch (img->target) {
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-      return GEN6_SURFTYPE_1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_2D_ARRAY:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-      return GEN6_SURFTYPE_2D;
-   case PIPE_TEXTURE_3D:
-      return GEN6_SURFTYPE_3D;
-   default:
-      assert(!"unknown texture target");
-      return GEN6_SURFTYPE_NULL;
-   }
-}
-
 static bool
 surface_validate_gen6_image(const struct ilo_dev *dev,
                             const struct ilo_state_surface_image_info *info)
@@ -408,6 +443,17 @@ surface_validate_gen6_image(const struct ilo_dev *dev,
       break;
    }
 
+   assert(info->img && info->vma);
+
+   if (info->img->tiling != GEN6_TILING_NONE)
+      assert(info->vma->vm_alignment % 4096 == 0);
+
+   if (info->aux_vma) {
+      assert(ilo_image_can_enable_aux(info->img, info->level_base));
+      /* always tiled */
+      assert(info->aux_vma->vm_alignment % 4096 == 0);
+   }
+
    /*
     * From the Sandy Bridge PRM, volume 4 part 1, page 78:
     *
@@ -418,17 +464,19 @@ surface_validate_gen6_image(const struct ilo_dev *dev,
    assert(info->img->bo_stride && info->img->bo_stride <= 512 * 1024 &&
           info->img->width0 <= info->img->bo_stride);
 
-   if (info->is_cube_map) {
-      assert(get_gen6_surface_type(dev, info->img) == GEN6_SURFTYPE_2D);
-
-      /*
-       * From the Sandy Bridge PRM, volume 4 part 1, page 78:
-       *
-       *     "For cube maps, Width must be set equal to the Height."
-       */
-      assert(info->img->width0 == info->img->height0);
+   if (info->type != info->img->type) {
+      assert(info->type == GEN6_SURFTYPE_2D &&
+             info->img->type == GEN6_SURFTYPE_CUBE);
    }
 
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 78:
+    *
+    *     "For cube maps, Width must be set equal to the Height."
+    */
+   if (info->type == GEN6_SURFTYPE_CUBE)
+      assert(info->img->width0 == info->img->height0);
+
    /*
     * From the Sandy Bridge PRM, volume 4 part 1, page 72:
     *
@@ -463,20 +511,21 @@ surface_validate_gen6_image(const struct ilo_dev *dev,
 }
 
 static void
-get_gen6_max_extent(const struct ilo_dev *dev,
-                    const struct ilo_image *img,
-                    uint16_t *max_w, uint16_t *max_h)
+surface_get_gen6_image_max_extent(const struct ilo_dev *dev,
+                                  const struct ilo_state_surface_image_info *info,
+                                  uint16_t *max_w, uint16_t *max_h)
 {
    const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   switch (get_gen6_surface_type(dev, img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
       *max_w = max_size;
       *max_h = 1;
       break;
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       *max_w = max_size;
       *max_h = max_size;
       break;
@@ -504,7 +553,7 @@ surface_get_gen6_image_extent(const struct ilo_dev *dev,
    w = info->img->width0;
    h = info->img->height0;
 
-   get_gen6_max_extent(dev, info->img, &max_w, &max_h);
+   surface_get_gen6_image_max_extent(dev, info, &max_w, &max_h);
    assert(w && h && w <= max_w && h <= max_h);
 
    *width = w - 1;
@@ -555,16 +604,17 @@ surface_get_gen6_image_slices(const struct ilo_dev *dev,
     * layers to (86 * 6), about 512.
     */
 
-   switch (get_gen6_surface_type(dev, info->img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 2048 : 512;
 
       assert(info->img->array_size <= max_slice);
       max_slice = info->img->array_size;
 
       d = info->slice_count;
-      if (info->is_cube_map) {
+      if (info->type == GEN6_SURFTYPE_CUBE) {
          if (info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
             if (!d || d % 6) {
                ilo_warn("invalid cube slice count\n");
@@ -877,7 +927,6 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
    uint8_t min_lod, mip_count;
    enum gen_sample_count sample_count;
    uint32_t alignments;
-   enum gen_surface_type type;
    uint32_t dw0, dw2, dw3, dw4, dw5;
 
    ILO_DEV_ASSERT(dev, 6, 6);
@@ -897,10 +946,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
    if (info->img->sample_count > 1)
       assert(info->img->interleaved_samples);
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-      get_gen6_surface_type(dev, info->img);
-
-   dw0 = type << GEN6_SURFACE_DW0_TYPE__SHIFT |
+   dw0 = info->type << GEN6_SURFACE_DW0_TYPE__SHIFT |
          info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT |
          GEN6_SURFACE_DW0_MIPLAYOUT_BELOW;
 
@@ -927,7 +973,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
     *     "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this
     *      field must be programmed to 111111b (all faces enabled)."
     */
-   if (info->is_cube_map &&
+   if (info->type == GEN6_SURFTYPE_CUBE &&
        info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
       dw0 |= GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE |
              GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
@@ -956,7 +1002,7 @@ surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
    surf->surface[4] = dw4;
    surf->surface[5] = dw5;
 
-   surf->type = type;
+   surf->type = info->type;
    surf->min_lod = min_lod;
    surf->mip_count = mip_count;
 
@@ -972,7 +1018,6 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
    uint8_t min_lod, mip_count;
    uint32_t alignments;
    enum gen_sample_count sample_count;
-   enum gen_surface_type type;
    uint32_t dw0, dw1, dw2, dw3, dw4, dw5, dw7;
 
    ILO_DEV_ASSERT(dev, 7, 8);
@@ -986,10 +1031,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
        !surface_get_gen6_image_alignments(dev, info, &alignments))
       return false;
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-      get_gen6_surface_type(dev, info->img);
-
-   dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT |
+   dw0 = info->type << GEN7_SURFACE_DW0_TYPE__SHIFT |
          info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT |
          alignments;
 
@@ -1023,7 +1065,7 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
     *      field must be programmed to 111111b (all faces enabled). This field
     *      is ignored unless the Surface Type is SURFTYPE_CUBE."
     */
-   if (info->is_cube_map &&
+   if (info->type == GEN6_SURFTYPE_CUBE &&
        info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER)
       dw0 |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
 
@@ -1087,13 +1129,61 @@ surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
       surf->surface[12] = 0;
    }
 
-   surf->type = type;
+   surf->type = info->type;
    surf->min_lod = min_lod;
    surf->mip_count = mip_count;
 
    return true;
 }
 
+uint32_t
+ilo_state_surface_buffer_size(const struct ilo_dev *dev,
+                              enum ilo_state_surface_access access,
+                              uint32_t size, uint32_t *alignment)
+{
+   switch (access) {
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+      /*
+       * From the Sandy Bridge PRM, volume 1 part 1, page 118:
+       *
+       *     "For buffers, which have no inherent "height," padding
+       *      requirements are different. A buffer must be padded to the next
+       *      multiple of 256 array elements, with an additional 16 bytes
+       *      added beyond that to account for the L1 cache line."
+       *
+       * Assuming tightly packed GEN6_FORMAT_R32G32B32A32_FLOAT, the size
+       * needs to be padded to 4096 (= 16 * 256).
+       */
+      *alignment = 1;
+      size = align(size, 4096) + 16;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+      /* element-size aligned; 16 bytes is assumed as the worst case */
+      *alignment = 16;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED:
+      /* assumed DWord aligned; NOTE(review): confirm against the PRM */
+      *alignment = 4;
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+      /* OWord (16-byte) aligned, and size is padded to an OWord multiple */
+      *alignment = 16;
+      size = align(size, 16);
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_SVB:
+      /* always DWord aligned */
+      *alignment = 4;
+      break;
+   default:
+      assert(!"unknown access");
+      *alignment = 1;
+      break;
+   }
+
+   return size;
+}
+
 bool
 ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
                                 const struct ilo_dev *dev)
@@ -1107,6 +1197,7 @@ ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
    else
       ret &= surface_set_gen6_null_SURFACE_STATE(surf, dev);
 
+   surf->vma = NULL;
    surf->type = GEN6_SURFTYPE_NULL;
    surf->readonly = true;
 
@@ -1129,6 +1220,7 @@ ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf,
    else
       ret &= surface_set_gen6_buffer_SURFACE_STATE(surf, dev, info);
 
+   surf->vma = info->vma;
    surf->readonly = info->readonly;
 
    assert(ret);
@@ -1150,6 +1242,9 @@ ilo_state_surface_init_for_image(struct ilo_state_surface *surf,
    else
       ret &= surface_set_gen6_image_SURFACE_STATE(surf, dev, info);
 
+   surf->vma = info->vma;
+   surf->aux_vma = info->aux_vma;
+
    surf->is_integer = info->is_integer;
    surf->readonly = info->readonly;
    surf->scanout = info->img->scanout;
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h
index 9c025428d50..e78c7c97db1 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_surface.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h
@@ -29,14 +29,10 @@
 #define ILO_STATE_SURFACE_H
 
 #include "genhw/genhw.h"
-#include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
 
-struct ilo_buffer;
-struct ilo_image;
-
 enum ilo_state_surface_access {
    ILO_STATE_SURFACE_ACCESS_SAMPLER,      /* sampling engine surfaces */
    ILO_STATE_SURFACE_ACCESS_DP_RENDER,    /* render target surfaces */
@@ -46,42 +42,51 @@ enum ilo_state_surface_access {
    ILO_STATE_SURFACE_ACCESS_DP_SVB,
 };
 
+struct ilo_vma;
+struct ilo_image;
+
 struct ilo_state_surface_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
+   uint32_t offset;
+   uint32_t size;
 
    enum ilo_state_surface_access access;
 
+   /* format_size may be less than, equal to, or greater than struct_size */
    enum gen_surface_format format;
    uint8_t format_size;
 
    bool readonly;
    uint16_t struct_size;
-
-   uint32_t offset;
-   uint32_t size;
 };
 
 struct ilo_state_surface_image_info {
    const struct ilo_image *img;
+   uint8_t level_base;
+   uint8_t level_count;
+   uint16_t slice_base;
+   uint16_t slice_count;
+
+   const struct ilo_vma *vma;
+   const struct ilo_vma *aux_vma;
 
    enum ilo_state_surface_access access;
 
+   enum gen_surface_type type;
+
    enum gen_surface_format format;
    bool is_integer;
 
    bool readonly;
-   bool is_cube_map;
    bool is_array;
-
-   uint8_t level_base;
-   uint8_t level_count;
-   uint16_t slice_base;
-   uint16_t slice_count;
 };
 
 struct ilo_state_surface {
    uint32_t surface[13];
 
+   const struct ilo_vma *vma;
+   const struct ilo_vma *aux_vma;
+
    enum gen_surface_type type;
    uint8_t min_lod;
    uint8_t mip_count;
@@ -89,9 +94,6 @@ struct ilo_state_surface {
 
    bool readonly;
    bool scanout;
-
-   /* managed by users */
-   struct intel_bo *bo;
 };
 
 bool
@@ -99,6 +101,11 @@ ilo_state_surface_valid_format(const struct ilo_dev *dev,
                                enum ilo_state_surface_access access,
                                enum gen_surface_format format);
 
+uint32_t
+ilo_state_surface_buffer_size(const struct ilo_dev *dev,
+                              enum ilo_state_surface_access access,
+                              uint32_t size, uint32_t *alignment);
+
 bool
 ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
                                 const struct ilo_dev *dev);
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c
index ddc75428ed7..9faf835fef2 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_vf.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c
@@ -26,7 +26,7 @@
  */
 
 #include "ilo_debug.h"
-#include "ilo_buffer.h"
+#include "ilo_vma.h"
 #include "ilo_state_vf.h"
 
 static bool
@@ -479,8 +479,8 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev,
 {
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (info->buf)
-      assert(info->offset < info->buf->bo_size && info->size);
+   if (info->vma)
+      assert(info->size && info->offset + info->size <= info->vma->vm_size);
 
    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 86:
@@ -500,6 +500,9 @@ vertex_buffer_validate_gen6(const struct ilo_dev *dev,
     *      aligned address, and BufferPitch must be a multiple of 64-bits."
     */
    if (info->cv_has_double) {
+      if (info->vma)
+         assert(info->vma->vm_alignment % 8 == 0);
+
       assert(info->stride % 8 == 0);
       assert((info->offset + info->cv_double_vertex_offset_mod_8) % 8 == 0);
    }
@@ -512,12 +515,7 @@ vertex_buffer_get_gen6_size(const struct ilo_dev *dev,
                             const struct ilo_state_vertex_buffer_info *info)
 {
    ILO_DEV_ASSERT(dev, 6, 8);
-
-   if (!info->buf)
-      return 0;
-
-   return (info->offset + info->size <= info->buf->bo_size) ? info->size :
-      info->buf->bo_size - info->offset;
+   return (info->vma) ? info->size : 0;
 }
 
 static bool
@@ -537,7 +535,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb,
 
    if (ilo_dev_gen(dev) >= ILO_GEN(7))
       dw0 |= GEN7_VB_DW0_ADDR_MODIFIED;
-   if (!info->buf)
+   if (!info->vma)
       dw0 |= GEN6_VB_DW0_IS_NULL;
 
    STATIC_ASSERT(ARRAY_SIZE(vb->vb) >= 3);
@@ -551,7 +549,7 @@ vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb,
       vb->vb[2] = (size) ? info->offset + size - 1 : 0;
    }
 
-   vb->need_bo = (info->buf != NULL);
+   vb->vma = info->vma;
 
    return true;
 }
@@ -586,8 +584,10 @@ index_buffer_validate_gen6(const struct ilo_dev *dev,
     */
    assert(info->offset % format_size == 0);
 
-   if (info->buf)
-      assert(info->offset < info->buf->bo_size && info->size);
+   if (info->vma) {
+      assert(info->vma->vm_alignment % format_size == 0);
+      assert(info->size && info->offset + info->size <= info->vma->vm_size);
+   }
 
    return true;
 }
@@ -600,12 +600,10 @@ index_buffer_get_gen6_size(const struct ilo_dev *dev,
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (!info->buf)
+   if (!info->vma)
       return 0;
 
-   size = (info->offset + info->size <= info->buf->bo_size) ? info->size :
-      info->buf->bo_size - info->offset;
-
+   size = info->size;
    if (ilo_dev_gen(dev) < ILO_GEN(8)) {
       const uint32_t format_size = get_index_format_size(info->format);
       size -= (size % format_size);
@@ -638,7 +636,7 @@ index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(struct ilo_state_index_buffer *ib,
       ib->ib[2] = (size) ? info->offset + size - 1 : 0;
    }
 
-   ib->need_bo = (info->buf != NULL);
+   ib->vma = info->vma;
 
    return true;
 }
@@ -949,6 +947,15 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
    }
 }
 
+uint32_t
+ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                             uint32_t *alignment)
+{
+   /* report 8-byte alignment so doubles are aligned; size is not padded */
+   *alignment = 8;
+   return size;
+}
+
 /**
  * No need to initialize first.
  */
@@ -966,6 +973,15 @@ ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
    return ret;
 }
 
+uint32_t
+ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                            uint32_t *alignment)
+{
+   /* align to the GEN6_INDEX_DWORD index size (worst case); no padding */
+   *alignment = get_index_format_size(GEN6_INDEX_DWORD);
+   return size;
+}
+
 /**
  * No need to initialize first.
  */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h
index f15c63a248a..16b128bf63c 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_vf.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h
@@ -126,10 +126,10 @@ struct ilo_state_vf_delta {
    uint32_t dirty;
 };
 
-struct ilo_buffer;
+struct ilo_vma;
 
 struct ilo_state_vertex_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
    uint32_t offset;
    uint32_t size;
 
@@ -143,14 +143,11 @@ struct ilo_state_vertex_buffer_info {
 struct ilo_state_vertex_buffer {
    uint32_t vb[3];
 
-   bool need_bo;
-
-   /* managed by users */
-   struct intel_bo *bo;
+   const struct ilo_vma *vma;
 };
 
 struct ilo_state_index_buffer_info {
-   const struct ilo_buffer *buf;
+   const struct ilo_vma *vma;
    uint32_t offset;
    uint32_t size;
 
@@ -160,10 +157,7 @@ struct ilo_state_index_buffer_info {
 struct ilo_state_index_buffer {
    uint32_t ib[3];
 
-   bool need_bo;
-
-   /* managed by users */
-   struct intel_bo *bo;
+   const struct ilo_vma *vma;
 };
 
 static inline size_t
@@ -215,11 +209,19 @@ ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
                        const struct ilo_state_vf *old,
                        struct ilo_state_vf_delta *delta);
 
+uint32_t
+ilo_state_vertex_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                             uint32_t *alignment);
+
 bool
 ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
                                  const struct ilo_dev *dev,
                                  const struct ilo_state_vertex_buffer_info *info);
 
+uint32_t
+ilo_state_index_buffer_size(const struct ilo_dev *dev, uint32_t size,
+                            uint32_t *alignment);
+
 bool
 ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib,
                                 const struct ilo_dev *dev,
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c
index 901fedb5599..827632764b2 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c
@@ -25,10 +25,9 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
-#include "intel_winsys.h"
-
 #include "ilo_debug.h"
 #include "ilo_image.h"
+#include "ilo_vma.h"
 #include "ilo_state_zs.h"
 
 static bool
@@ -56,70 +55,9 @@ zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    zs->depth[3] = 0;
    zs->depth[4] = 0;
 
-   zs->depth_format = format;
-
    return true;
 }
 
-static enum gen_surface_type
-get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   switch (img->target) {
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-      return GEN6_SURFTYPE_1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_2D_ARRAY:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-      return GEN6_SURFTYPE_2D;
-   case PIPE_TEXTURE_3D:
-      return GEN6_SURFTYPE_3D;
-   default:
-      assert(!"unknown texture target");
-      return GEN6_SURFTYPE_NULL;
-   }
-}
-
-static enum gen_depth_format
-get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      switch (img->format) {
-      case PIPE_FORMAT_Z32_FLOAT:
-         return GEN6_ZFORMAT_D32_FLOAT;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
-      case PIPE_FORMAT_Z16_UNORM:
-         return GEN6_ZFORMAT_D16_UNORM;
-      default:
-         assert(!"unknown depth format");
-         return GEN6_ZFORMAT_D32_FLOAT;
-      }
-   } else {
-      switch (img->format) {
-      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-         return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
-      case PIPE_FORMAT_Z32_FLOAT:
-         return GEN6_ZFORMAT_D32_FLOAT;
-      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-         return GEN6_ZFORMAT_D24_UNORM_S8_UINT;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
-      case PIPE_FORMAT_Z16_UNORM:
-         return GEN6_ZFORMAT_D16_UNORM;
-      default:
-         assert(!"unknown depth format");
-         return GEN6_ZFORMAT_D32_FLOAT;
-      }
-   }
-}
-
 static bool
 zs_validate_gen6(const struct ilo_dev *dev,
                  const struct ilo_state_zs_info *info)
@@ -128,63 +66,102 @@ zs_validate_gen6(const struct ilo_dev *dev,
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
+   assert(!info->z_img == !info->z_vma);
+   assert(!info->s_img == !info->s_vma);
+
+   /* all tiled */
+   if (info->z_img) {
+      assert(info->z_img->tiling == GEN6_TILING_Y);
+      assert(info->z_vma->vm_alignment % 4096 == 0);
+   }
+   if (info->s_img) {
+      assert(info->s_img->tiling == GEN8_TILING_W);
+      assert(info->s_vma->vm_alignment % 4096 == 0);
+   }
+   if (info->hiz_vma) {
+      assert(info->z_img &&
+             ilo_image_can_enable_aux(info->z_img, info->level));
+      assert(info->hiz_vma->vm_alignment % 4096 == 0);
+   }
+
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 315:
     *
-    *      The stencil buffer has a format of S8_UINT, and shares Surface
+    *     "The stencil buffer has a format of S8_UINT, and shares Surface
     *      Type, Height, Width, and Depth, Minimum Array Element, Render
     *      Target View Extent, Depth Coordinate Offset X/Y, LOD, and Depth
-    *      Buffer Object Control State fields of the depth buffer.
+    *      Buffer Object Control State fields of the depth buffer."
     */
-   if (info->z_img == info->s_img) {
-      assert(info->z_img->target == info->s_img->target &&
-             info->z_img->width0 == info->s_img->width0 &&
+   if (info->z_img && info->s_img && info->z_img != info->s_img) {
+      assert(info->z_img->type == info->s_img->type &&
              info->z_img->height0 == info->s_img->height0 &&
              info->z_img->depth0 == info->s_img->depth0);
    }
 
+   if (info->type != img->type) {
+      assert(info->type == GEN6_SURFTYPE_2D &&
+             img->type == GEN6_SURFTYPE_CUBE);
+   }
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      switch (info->format) {
+      case GEN6_ZFORMAT_D32_FLOAT:
+      case GEN6_ZFORMAT_D24_UNORM_X8_UINT:
+      case GEN6_ZFORMAT_D16_UNORM:
+         break;
+      default:
+         assert(!"unknown depth format");
+         break;
+      }
+   } else {
+      /*
+       * From the Ironlake PRM, volume 2 part 1, page 330:
+       *
+       *     "If this field (Separate Stencil Buffer Enable) is disabled, the
+       *      Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
+       *
+       * From the Sandy Bridge PRM, volume 2 part 1, page 321:
+       *
+       *     "[DevSNB]: This field (Separate Stencil Buffer Enable) must be
+       *      set to the same value (enabled or disabled) as Hierarchical
+       *      Depth Buffer Enable."
+       */
+      if (info->hiz_vma)
+         assert(info->format != GEN6_ZFORMAT_D24_UNORM_S8_UINT);
+      else
+         assert(info->format != GEN6_ZFORMAT_D24_UNORM_X8_UINT);
+   }
+
    assert(info->level < img->level_count);
    assert(img->bo_stride);
 
-   if (info->hiz_enable) {
-      assert(info->z_img &&
-             ilo_image_can_enable_aux(info->z_img, info->level));
-   }
-
-   if (info->is_cube_map) {
-      assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D);
-
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 323:
-       *
-       *     "For cube maps, Width must be set equal to Height."
-       */
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 323:
+    *
+    *     "For cube maps, Width must be set equal to Height."
+    */
+   if (info->type == GEN6_SURFTYPE_CUBE)
       assert(img->width0 == img->height0);
-   }
-
-   if (info->z_img)
-      assert(info->z_img->tiling == GEN6_TILING_Y);
-   if (info->s_img)
-      assert(info->s_img->tiling == GEN8_TILING_W);
 
    return true;
 }
 
 static void
-get_gen6_max_extent(const struct ilo_dev *dev,
-                    const struct ilo_image *img,
-                    uint16_t *max_w, uint16_t *max_h)
+zs_get_gen6_max_extent(const struct ilo_dev *dev,
+                       const struct ilo_state_zs_info *info,
+                       uint16_t *max_w, uint16_t *max_h)
 {
    const uint16_t max_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   switch (get_gen6_surface_type(dev, img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
       *max_w = max_size;
       *max_h = 1;
       break;
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       *max_w = max_size;
       *max_h = max_size;
       break;
@@ -274,7 +251,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev,
    w = img->width0;
    h = img->height0;
 
-   if (info->hiz_enable) {
+   if (info->hiz_vma) {
       uint16_t align_w, align_h;
 
       get_gen6_hiz_alignments(dev, info->z_img, &align_w, &align_h);
@@ -290,7 +267,7 @@ zs_get_gen6_depth_extent(const struct ilo_dev *dev,
       h = align(h, align_h);
    }
 
-   get_gen6_max_extent(dev, img, &max_w, &max_h);
+   zs_get_gen6_max_extent(dev, info, &max_w, &max_h);
    assert(w && h && w <= max_w && h <= max_h);
 
    *width = w - 1;
@@ -319,16 +296,17 @@ zs_get_gen6_depth_slices(const struct ilo_dev *dev,
     *      surfaces. If the volume texture is MIP-mapped, this field specifies
     *      the depth of the base MIP level."
     */
-   switch (get_gen6_surface_type(dev, img)) {
+   switch (info->type) {
    case GEN6_SURFTYPE_1D:
    case GEN6_SURFTYPE_2D:
+   case GEN6_SURFTYPE_CUBE:
       max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512;
 
       assert(img->array_size <= max_slice);
       max_slice = img->array_size;
 
       d = info->slice_count;
-      if (info->is_cube_map) {
+      if (info->type == GEN6_SURFTYPE_CUBE) {
          /*
           * Minumum Array Element and Depth must be 0; Render Target View
           * Extent is ignored.
@@ -408,8 +386,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  const struct ilo_state_zs_info *info)
 {
    uint16_t width, height, depth, array_base, view_extent;
-   enum gen_surface_type type;
-   enum gen_depth_format format;
    uint32_t dw1, dw2, dw3, dw4;
 
    ILO_DEV_ASSERT(dev, 6, 6);
@@ -420,37 +396,15 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  &view_extent))
       return false;
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-          (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
-                          get_gen6_surface_type(dev, info->s_img);
-
-   format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
-      GEN6_ZFORMAT_D32_FLOAT;
-
-   /*
-    * From the Ironlake PRM, volume 2 part 1, page 330:
-    *
-    *     "If this field (Separate Stencil Buffer Enable) is disabled, the
-    *      Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
-    *
-    * From the Sandy Bridge PRM, volume 2 part 1, page 321:
-    *
-    *     "[DevSNB]: This field (Separate Stencil Buffer Enable) must be set
-    *      to the same value (enabled or disabled) as Hierarchical Depth
-    *      Buffer Enable."
-    */
-   if (!info->hiz_enable && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT)
-      format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
-
    /* info->z_readonly and info->s_readonly are ignored on Gen6 */
-   dw1 = type << GEN6_DEPTH_DW1_TYPE__SHIFT |
+   dw1 = info->type << GEN6_DEPTH_DW1_TYPE__SHIFT |
          GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT |
-         format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
+         info->format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
 
    if (info->z_img)
       dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT;
 
-   if (info->hiz_enable || !info->z_img) {
+   if (info->hiz_vma || !info->z_img) {
       dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE |
              GEN6_DEPTH_DW1_SEPARATE_STENCIL;
    }
@@ -471,8 +425,6 @@ zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    zs->depth[3] = dw4;
    zs->depth[4] = 0;
 
-   zs->depth_format = format;
-
    return true;
 }
 
@@ -481,8 +433,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  const struct ilo_dev *dev,
                                  const struct ilo_state_zs_info *info)
 {
-   enum gen_surface_type type;
-   enum gen_depth_format format;
    uint16_t width, height, depth;
    uint16_t array_base, view_extent;
    uint32_t dw1, dw2, dw3, dw4, dw6;
@@ -495,20 +445,13 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
                                  &view_extent))
       return false;
 
-   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
-          (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
-                          get_gen6_surface_type(dev, info->s_img);
-
-   format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
-      GEN6_ZFORMAT_D32_FLOAT;
-
-   dw1 = type << GEN7_DEPTH_DW1_TYPE__SHIFT |
-         format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
+   dw1 = info->type << GEN7_DEPTH_DW1_TYPE__SHIFT |
+         info->format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
 
    if (info->z_img) {
       if (!info->z_readonly)
          dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE;
-      if (info->hiz_enable)
+      if (info->hiz_vma)
          dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE;
 
       dw1 |= (info->z_img->bo_stride - 1) << GEN7_DEPTH_DW1_PITCH__SHIFT;
@@ -539,8 +482,6 @@ zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
    zs->depth[3] = dw4;
    zs->depth[4] = dw6;
 
-   zs->depth_format = format;
-
    return true;
 }
 
@@ -683,11 +624,15 @@ ilo_state_zs_init(struct ilo_state_zs *zs, const struct ilo_dev *dev,
    else
       ret &= zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(zs, dev);
 
-   if (info->z_img && info->hiz_enable)
+   if (info->z_img && info->hiz_vma)
       ret &= zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(zs, dev, info);
    else
       ret &= zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
 
+   zs->z_vma = info->z_vma;
+   zs->s_vma = info->s_vma;
+   zs->hiz_vma = info->hiz_vma;
+
    zs->z_readonly = info->z_readonly;
    zs->s_readonly = info->s_readonly;
 
@@ -703,6 +648,8 @@ ilo_state_zs_init_for_null(struct ilo_state_zs *zs,
    struct ilo_state_zs_info info;
 
    memset(&info, 0, sizeof(info));
+   info.type = GEN6_SURFTYPE_NULL;
+   info.format = GEN6_ZFORMAT_D32_FLOAT;
 
    return ilo_state_zs_init(zs, dev, &info);
 }
@@ -720,8 +667,11 @@ ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
     */
    assert(ilo_dev_gen(dev) >= ILO_GEN(7));
 
-   zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE;
-   zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+   if (zs->hiz_vma) {
+      zs->depth[0] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE;
+      zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+      zs->hiz_vma = NULL;
+   }
 
    return true;
 }
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h
index 98212daf74f..6a25a873897 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_zs.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h
@@ -29,28 +29,31 @@
 #define ILO_STATE_ZS_H
 
 #include "genhw/genhw.h"
-#include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
 
+struct ilo_vma;
 struct ilo_image;
 
 struct ilo_state_zs_info {
-   /* both are optional */
+   /* both optional */
    const struct ilo_image *z_img;
    const struct ilo_image *s_img;
+   uint8_t level;
+   uint16_t slice_base;
+   uint16_t slice_count;
+
+   const struct ilo_vma *z_vma;
+   const struct ilo_vma *s_vma;
+   const struct ilo_vma *hiz_vma;
+
+   enum gen_surface_type type;
+   enum gen_depth_format format;
 
    /* ignored prior to Gen7 */
    bool z_readonly;
    bool s_readonly;
-
-   bool hiz_enable;
-   bool is_cube_map;
-
-   uint8_t level;
-   uint16_t slice_base;
-   uint16_t slice_count;
 };
 
 struct ilo_state_zs {
@@ -58,16 +61,12 @@ struct ilo_state_zs {
    uint32_t stencil[3];
    uint32_t hiz[3];
 
-   /* TODO move this to ilo_image */
-   enum gen_depth_format depth_format;
+   const struct ilo_vma *z_vma;
+   const struct ilo_vma *s_vma;
+   const struct ilo_vma *hiz_vma;
 
    bool z_readonly;
    bool s_readonly;
-
-   /* managed by users */
-   struct intel_bo *depth_bo;
-   struct intel_bo *stencil_bo;
-   struct intel_bo *hiz_bo;
 };
 
 bool
@@ -83,11 +82,4 @@ bool
 ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
                          const struct ilo_dev *dev);
 
-static inline enum gen_depth_format
-ilo_state_zs_get_depth_format(const struct ilo_state_zs *zs,
-                              const struct ilo_dev *dev)
-{
-   return zs->depth_format;
-}
-
 #endif /* ILO_STATE_ZS_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_vma.h
similarity index 60%
rename from src/gallium/drivers/ilo/core/ilo_buffer.h
rename to src/gallium/drivers/ilo/core/ilo_vma.h
index ca3c61ff890..ad2a1d4b33e 100644
--- a/src/gallium/drivers/ilo/core/ilo_buffer.h
+++ b/src/gallium/drivers/ilo/core/ilo_vma.h
@@ -1,7 +1,7 @@
 /*
  * Mesa 3-D graphics library
  *
- * Copyright (C) 2012-2013 LunarG, Inc.
+ * Copyright (C) 2015 LunarG, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -25,40 +25,49 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
-#ifndef ILO_BUFFER_H
-#define ILO_BUFFER_H
-
-#include "intel_winsys.h"
+#ifndef ILO_VMA_H
+#define ILO_VMA_H
 
 #include "ilo_core.h"
 #include "ilo_debug.h"
 #include "ilo_dev.h"
 
-struct ilo_buffer {
-   unsigned bo_size;
+struct intel_bo;
 
-   /* managed by users */
+/**
+ * A virtual memory area.
+ */
+struct ilo_vma {
+   /* address space */
+   uint32_t vm_size;
+   uint32_t vm_alignment;
+
+   /* backing storage */
    struct intel_bo *bo;
+   uint32_t bo_offset;
 };
 
-static inline void
-ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev,
-                unsigned size, uint32_t bind, uint32_t flags)
+static inline bool
+ilo_vma_init(struct ilo_vma *vma, const struct ilo_dev *dev,
+             uint32_t size, uint32_t alignment)
 {
-   assert(ilo_is_zeroed(buf, sizeof(*buf)));
+   assert(ilo_is_zeroed(vma, sizeof(*vma)));
+   assert(size && alignment);
 
-   buf->bo_size = size;
+   vma->vm_alignment = alignment;
+   vma->vm_size = size;
 
-   /*
-    * From the Sandy Bridge PRM, volume 1 part 1, page 118:
-    *
-    *     "For buffers, which have no inherent "height," padding requirements
-    *      are different. A buffer must be padded to the next multiple of 256
-    *      array elements, with an additional 16 bytes added beyond that to
-    *      account for the L1 cache line."
-    */
-   if (bind & PIPE_BIND_SAMPLER_VIEW)
-      buf->bo_size = align(buf->bo_size, 256) + 16;
+   return true;
 }
 
-#endif /* ILO_BUFFER_H */
+static inline void
+ilo_vma_set_bo(struct ilo_vma *vma, const struct ilo_dev *dev,
+               struct intel_bo *bo, uint32_t offset)
+{
+   assert(offset % vma->vm_alignment == 0);
+
+   vma->bo = bo;
+   vma->bo_offset = offset;
+}
+
+#endif /* ILO_VMA_H */
diff --git a/src/gallium/drivers/ilo/ilo_blitter_blt.c b/src/gallium/drivers/ilo/ilo_blitter_blt.c
index d55dc35e360..66203e86137 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_blt.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_blt.c
@@ -127,7 +127,7 @@ ilo_blitter_blt_end(struct ilo_blitter *blitter, uint32_t swctrl)
 
 static bool
 buf_clear_region(struct ilo_blitter *blitter,
-                 struct ilo_buffer *buf, unsigned offset,
+                 struct ilo_buffer_resource *buf, unsigned offset,
                  uint32_t val, unsigned size,
                  enum gen6_blt_mask value_mask,
                  enum gen6_blt_mask write_mask)
@@ -140,8 +140,8 @@ buf_clear_region(struct ilo_blitter *blitter,
    if (offset % cpp || size % cpp)
       return false;
 
-   dst.bo = buf->bo;
-   dst.offset = offset;
+   dst.bo = buf->vma.bo;
+   dst.offset = buf->vma.bo_offset + offset;
 
    ilo_blitter_blt_begin(blitter, GEN6_COLOR_BLT__SIZE *
          (1 + size / 32764 / gen6_blt_max_scanlines),
@@ -179,25 +179,26 @@ buf_clear_region(struct ilo_blitter *blitter,
 
 static bool
 buf_copy_region(struct ilo_blitter *blitter,
-                struct ilo_buffer *dst_buf, unsigned dst_offset,
-                struct ilo_buffer *src_buf, unsigned src_offset,
+                struct ilo_buffer_resource *dst_buf, unsigned dst_offset,
+                struct ilo_buffer_resource *src_buf, unsigned src_offset,
                 unsigned size)
 {
    const uint8_t rop = 0xcc; /* SRCCOPY */
    struct ilo_builder *builder = &blitter->ilo->cp->builder;
    struct gen6_blt_bo dst, src;
 
-   dst.bo = dst_buf->bo;
-   dst.offset = dst_offset;
+   dst.bo = dst_buf->vma.bo;
+   dst.offset = dst_buf->vma.bo_offset + dst_offset;
    dst.pitch = 0;
 
-   src.bo = src_buf->bo;
-   src.offset = src_offset;
+   src.bo = src_buf->vma.bo;
+   src.offset = src_buf->vma.bo_offset + src_offset;
    src.pitch = 0;
 
    ilo_blitter_blt_begin(blitter, GEN6_SRC_COPY_BLT__SIZE *
          (1 + size / 32764 / gen6_blt_max_scanlines),
-         dst_buf->bo, GEN6_TILING_NONE, src_buf->bo, GEN6_TILING_NONE);
+         dst_buf->vma.bo, GEN6_TILING_NONE,
+         src_buf->vma.bo, GEN6_TILING_NONE);
 
    while (size) {
       unsigned width, height;
@@ -258,14 +259,14 @@ tex_clear_region(struct ilo_blitter *blitter,
    if (dst_box->width * cpp > gen6_blt_max_bytes_per_scanline)
       return false;
 
-   dst.bo = dst_tex->image.bo;
-   dst.offset = 0;
+   dst.bo = dst_tex->vma.bo;
+   dst.offset = dst_tex->vma.bo_offset;
    dst.pitch = dst_tex->image.bo_stride;
    dst.tiling = dst_tex->image.tiling;
 
    swctrl = ilo_blitter_blt_begin(blitter,
          GEN6_XY_COLOR_BLT__SIZE * dst_box->depth,
-         dst_tex->image.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE);
+         dst_tex->vma.bo, dst_tex->image.tiling, NULL, GEN6_TILING_NONE);
 
    for (slice = 0; slice < dst_box->depth; slice++) {
       unsigned x, y;
@@ -299,7 +300,7 @@ tex_copy_region(struct ilo_blitter *blitter,
                 const struct pipe_box *src_box)
 {
    const struct util_format_description *desc =
-      util_format_description(dst_tex->image.format);
+      util_format_description(dst_tex->image_format);
    const unsigned max_extent = 32767; /* INT16_MAX */
    const uint8_t rop = 0xcc; /* SRCCOPY */
    struct ilo_builder *builder = &blitter->ilo->cp->builder;
@@ -347,13 +348,13 @@ tex_copy_region(struct ilo_blitter *blitter,
       break;
    }
 
-   dst.bo = dst_tex->image.bo;
-   dst.offset = 0;
+   dst.bo = dst_tex->vma.bo;
+   dst.offset = dst_tex->vma.bo_offset;
    dst.pitch = dst_tex->image.bo_stride;
    dst.tiling = dst_tex->image.tiling;
 
-   src.bo = src_tex->image.bo;
-   src.offset = 0;
+   src.bo = src_tex->vma.bo;
+   src.offset = src_tex->vma.bo_offset;
    src.pitch = src_tex->image.bo_stride;
    src.tiling = src_tex->image.tiling;
 
@@ -423,8 +424,8 @@ ilo_blitter_blt_copy_resource(struct ilo_blitter *blitter,
              src_box->height == 1 &&
              src_box->depth == 1);
 
-      success = buf_copy_region(blitter,
-            ilo_buffer(dst), dst_offset, ilo_buffer(src), src_offset, size);
+      success = buf_copy_region(blitter, ilo_buffer_resource(dst), dst_offset,
+            ilo_buffer_resource(src), src_offset, size);
    }
    else if (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER) {
       success = tex_copy_region(blitter,
@@ -488,7 +489,7 @@ ilo_blitter_blt_clear_rt(struct ilo_blitter *blitter,
       if (offset + size > end)
          size = end - offset;
 
-      success = buf_clear_region(blitter, ilo_buffer(rt->texture),
+      success = buf_clear_region(blitter, ilo_buffer_resource(rt->texture),
             offset, packed.ui[0], size, mask, mask);
    }
    else {
diff --git a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
index 13c8f500680..86e67084d6e 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
@@ -318,7 +318,7 @@ hiz_can_clear_zs(const struct ilo_blitter *blitter,
     * The truth is when HiZ is enabled, separate stencil is also enabled on
     * all GENs.  The depth buffer format cannot be combined depth/stencil.
     */
-   switch (tex->image.format) {
+   switch (tex->image_format) {
    case PIPE_FORMAT_Z16_UNORM:
       if (ilo_dev_gen(blitter->ilo->dev) == ILO_GEN(6) &&
           tex->base.width0 % 16)
@@ -355,7 +355,7 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
    if (ilo_dev_gen(blitter->ilo->dev) >= ILO_GEN(8))
       clear_value = fui(depth);
    else
-      clear_value = util_pack_z(tex->image.format, depth);
+      clear_value = util_pack_z(tex->image_format, depth);
 
    ilo_blit_resolve_surface(blitter->ilo, zs,
          ILO_TEXTURE_RENDER_WRITE | ILO_TEXTURE_CLEAR);
diff --git a/src/gallium/drivers/ilo/ilo_common.h b/src/gallium/drivers/ilo/ilo_common.h
index 9ebbf76e81e..3dbe79fb872 100644
--- a/src/gallium/drivers/ilo/ilo_common.h
+++ b/src/gallium/drivers/ilo/ilo_common.h
@@ -28,6 +28,14 @@
 #ifndef ILO_COMMON_H
 #define ILO_COMMON_H
 
+#include "pipe/p_format.h"
+#include "pipe/p_defines.h"
+
+#include "util/list.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_pointer.h"
+
 #include "core/ilo_core.h"
 #include "core/ilo_debug.h"
 #include "core/ilo_dev.h"
diff --git a/src/gallium/drivers/ilo/ilo_context.c b/src/gallium/drivers/ilo/ilo_context.c
index 3d5c7b636a8..b9a16aab81d 100644
--- a/src/gallium/drivers/ilo/ilo_context.c
+++ b/src/gallium/drivers/ilo/ilo_context.c
@@ -62,6 +62,8 @@ ilo_flush(struct pipe_context *pipe,
          (flags & PIPE_FLUSH_END_OF_FRAME) ? "frame end" : "user request");
 
    if (f) {
+      struct pipe_screen *screen = pipe->screen;
+      screen->fence_reference(screen, f, NULL);
       *f = ilo_screen_fence_create(pipe->screen, ilo->cp->last_submitted_bo);
    }
 }
diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c
index e8e1a4cd14c..433348d9326 100644
--- a/src/gallium/drivers/ilo/ilo_draw.c
+++ b/src/gallium/drivers/ilo/ilo_draw.c
@@ -444,6 +444,7 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
                          const struct pipe_draw_info *info)
 {
    const struct ilo_ib_state *ib = &ilo->state_vector.ib;
+   const struct ilo_vma *vma;
    union {
       const void *ptr;
       const uint8_t *u8;
@@ -453,10 +454,12 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
 
    /* we will draw with IB mapped */
    if (ib->state.buffer) {
-      u.ptr = intel_bo_map(ilo_buffer(ib->state.buffer)->bo, false);
+      vma = ilo_resource_get_vma(ib->state.buffer);
+      u.ptr = intel_bo_map(vma->bo, false);
       if (u.ptr)
-         u.u8 += ib->state.offset;
+         u.u8 += vma->bo_offset + ib->state.offset;
    } else {
+      vma = NULL;
       u.ptr = ib->state.user_buffer;
    }
 
@@ -500,8 +503,8 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
 
 #undef DRAW_VBO_WITH_SW_RESTART
 
-   if (ib->state.buffer)
-      intel_bo_unmap(ilo_buffer(ib->state.buffer)->bo);
+   if (vma)
+      intel_bo_unmap(vma->bo);
 }
 
 static bool
diff --git a/src/gallium/drivers/ilo/ilo_format.h b/src/gallium/drivers/ilo/ilo_format.h
index 4e955c09c14..0a19c02659e 100644
--- a/src/gallium/drivers/ilo/ilo_format.h
+++ b/src/gallium/drivers/ilo/ilo_format.h
@@ -165,4 +165,39 @@ ilo_format_translate_vertex(const struct ilo_dev *dev,
    return ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER);
 }
 
+static inline enum gen_depth_format
+ilo_format_translate_depth(const struct ilo_dev *dev,
+                           enum pipe_format format)
+{
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      switch (format) {
+      case PIPE_FORMAT_Z32_FLOAT:
+         return GEN6_ZFORMAT_D32_FLOAT;
+      case PIPE_FORMAT_Z24X8_UNORM:
+         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+      case PIPE_FORMAT_Z16_UNORM:
+         return GEN6_ZFORMAT_D16_UNORM;
+      default:
+         assert(!"unknown depth format");
+         return GEN6_ZFORMAT_D32_FLOAT;
+      }
+   } else {
+      switch (format) {
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
+      case PIPE_FORMAT_Z32_FLOAT:
+         return GEN6_ZFORMAT_D32_FLOAT;
+      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+         return GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+      case PIPE_FORMAT_Z24X8_UNORM:
+         return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+      case PIPE_FORMAT_Z16_UNORM:
+         return GEN6_ZFORMAT_D16_UNORM;
+      default:
+         assert(!"unknown depth format");
+         return GEN6_ZFORMAT_D32_FLOAT;
+      }
+   }
+}
+
 #endif /* ILO_FORMAT_H */
diff --git a/src/gallium/drivers/ilo/ilo_render_surface.c b/src/gallium/drivers/ilo/ilo_render_surface.c
index ad053564294..3bf8646b344 100644
--- a/src/gallium/drivers/ilo/ilo_render_surface.c
+++ b/src/gallium/drivers/ilo/ilo_render_surface.c
@@ -42,14 +42,17 @@ gen6_so_SURFACE_STATE(struct ilo_builder *builder,
                       const struct pipe_stream_output_info *so_info,
                       int so_index)
 {
-   struct ilo_buffer *buf = ilo_buffer(so->buffer);
    struct ilo_state_surface_buffer_info info;
    struct ilo_state_surface surf;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
    memset(&info, 0, sizeof(info));
-   info.buf = buf;
+
+   info.vma = ilo_resource_get_vma(so->buffer);
+   info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
+   info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4;
+
    info.access = ILO_STATE_SURFACE_ACCESS_DP_SVB;
 
    switch (so_info->output[so_index].num_components) {
@@ -78,12 +81,9 @@ gen6_so_SURFACE_STATE(struct ilo_builder *builder,
 
    info.struct_size =
       so_info->stride[so_info->output[so_index].output_buffer] * 4;
-   info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
-   info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4;
 
    memset(&surf, 0, sizeof(surf));
    ilo_state_surface_init_for_buffer(&surf, builder->dev, &info);
-   surf.bo = info.buf->bo;
 
    return gen6_SURFACE_STATE(builder, &surf);
 }
@@ -482,18 +482,19 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r,
       return;
 
    memset(&info, 0, sizeof(info));
-   info.buf = ilo_buffer(session->input->buffer);
+
+   info.vma = ilo_resource_get_vma(session->input->buffer);
+   info.offset = session->input->buffer_offset;
+   info.size = session->input->buffer_size;
+
    info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
    info.format = GEN6_FORMAT_RAW;
    info.format_size = 1;
    info.struct_size = 1;
    info.readonly = true;
-   info.offset = session->input->buffer_offset;
-   info.size = session->input->buffer_size;
 
    memset(&surf, 0, sizeof(surf));
    ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
-   surf.bo = info.buf->bo;
 
    assert(count == 1 && session->input->buffer);
    surface_state[base] = gen6_SURFACE_STATE(r->builder, &surf);
@@ -538,23 +539,23 @@ gen6_emit_launch_grid_surface_global(struct ilo_render *r,
    surface_state += base;
    for (i = 0; i < count; i++) {
       if (i < vec->global_binding.count && bindings[i].resource) {
-         const struct ilo_buffer *buf = ilo_buffer(bindings[i].resource);
          struct ilo_state_surface_buffer_info info;
          struct ilo_state_surface surf;
 
          assert(bindings[i].resource->target == PIPE_BUFFER);
 
          memset(&info, 0, sizeof(info));
-         info.buf = buf;
+
+         info.vma = ilo_resource_get_vma(bindings[i].resource);
+         info.size = info.vma->vm_size;
+
          info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
          info.format = GEN6_FORMAT_RAW;
          info.format_size = 1;
          info.struct_size = 1;
-         info.size = buf->bo_size;
 
          memset(&surf, 0, sizeof(surf));
          ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
-         surf.bo = info.buf->bo;
 
          surface_state[i] = gen6_SURFACE_STATE(r->builder, &surf);
       } else {
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index be9fd10a84c..9026ba9a983 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -25,7 +25,12 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
+#include "core/ilo_state_vf.h"
+#include "core/ilo_state_sol.h"
+#include "core/ilo_state_surface.h"
+
 #include "ilo_screen.h"
+#include "ilo_format.h"
 #include "ilo_resource.h"
 
 /*
@@ -83,6 +88,134 @@ resource_get_cpu_init(const struct pipe_resource *templ)
                           PIPE_BIND_STREAM_OUTPUT)) ? false : true;
 }
 
+static enum gen_surface_type
+get_surface_type(enum pipe_texture_target target)
+{
+   switch (target) {
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return GEN6_SURFTYPE_1D;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_2D_ARRAY:
+      return GEN6_SURFTYPE_2D;
+   case PIPE_TEXTURE_3D:
+      return GEN6_SURFTYPE_3D;
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return GEN6_SURFTYPE_CUBE;
+   default:
+      assert(!"unknown texture target");
+      return GEN6_SURFTYPE_NULL;
+   }
+}
+
+static enum pipe_format
+resource_get_image_format(const struct pipe_resource *templ,
+                          const struct ilo_dev *dev,
+                          bool *separate_stencil_ret)
+{
+   enum pipe_format format = templ->format;
+   bool separate_stencil;
+
+   /* silently promote ETC1 */
+   if (templ->format == PIPE_FORMAT_ETC1_RGB8)
+      format = PIPE_FORMAT_R8G8B8X8_UNORM;
+
+   /* separate stencil buffers */
+   separate_stencil = false;
+   if ((templ->bind & PIPE_BIND_DEPTH_STENCIL) &&
+       util_format_is_depth_and_stencil(templ->format)) {
+      switch (templ->format) {
+      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+         /* Gen6 requires HiZ to be available for all levels */
+         if (ilo_dev_gen(dev) >= ILO_GEN(7) || templ->last_level == 0) {
+            format = PIPE_FORMAT_Z32_FLOAT;
+            separate_stencil = true;
+         }
+         break;
+      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+         format = PIPE_FORMAT_Z24X8_UNORM;
+         separate_stencil = true;
+         break;
+      default:
+         break;
+      }
+   }
+
+   if (separate_stencil_ret)
+      *separate_stencil_ret = separate_stencil;
+
+   return format;
+}
+
+static inline enum gen_surface_format
+pipe_to_surface_format(const struct ilo_dev *dev, enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      return GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS;
+   case PIPE_FORMAT_Z32_FLOAT:
+      return GEN6_FORMAT_R32_FLOAT;
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      return GEN6_FORMAT_R24_UNORM_X8_TYPELESS;
+   case PIPE_FORMAT_Z16_UNORM:
+      return GEN6_FORMAT_R16_UNORM;
+   case PIPE_FORMAT_S8_UINT:
+      return GEN6_FORMAT_R8_UINT;
+   default:
+      return ilo_format_translate_color(dev, format);
+   }
+}
+
+static void
+resource_get_image_info(const struct pipe_resource *templ,
+                        const struct ilo_dev *dev,
+                        enum pipe_format image_format,
+                        struct ilo_image_info *info)
+{
+   memset(info, 0, sizeof(*info));
+
+   info->type = get_surface_type(templ->target);
+
+   info->format = pipe_to_surface_format(dev, image_format);
+   info->interleaved_stencil = util_format_is_depth_and_stencil(image_format);
+   info->is_integer = util_format_is_pure_integer(image_format);
+   info->compressed = util_format_is_compressed(image_format);
+   info->block_width = util_format_get_blockwidth(image_format);
+   info->block_height = util_format_get_blockheight(image_format);
+   info->block_size = util_format_get_blocksize(image_format);
+
+   info->width = templ->width0;
+   info->height = templ->height0;
+   info->depth = templ->depth0;
+   info->array_size = templ->array_size;
+   info->level_count = templ->last_level + 1;
+   info->sample_count = (templ->nr_samples) ? templ->nr_samples : 1;
+
+   info->aux_disable = (templ->usage == PIPE_USAGE_STAGING);
+
+   if (templ->bind & PIPE_BIND_LINEAR)
+      info->valid_tilings = 1 << GEN6_TILING_NONE;
+
+   /*
+    * Tiled images must be mapped via GTT to get a linear view.  Prefer linear
+    * images when the image size is greater than one-fourth of the mappable
+    * aperture.
+    */
+   if (templ->bind & (PIPE_BIND_TRANSFER_WRITE | PIPE_BIND_TRANSFER_READ))
+      info->prefer_linear_threshold = dev->aperture_mappable / 4;
+
+   info->bind_surface_sampler = (templ->bind & PIPE_BIND_SAMPLER_VIEW);
+   info->bind_surface_dp_render = (templ->bind & PIPE_BIND_RENDER_TARGET);
+   info->bind_surface_dp_typed = (templ->bind &
+         (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE));
+   info->bind_zs = (templ->bind & PIPE_BIND_DEPTH_STENCIL);
+   info->bind_scanout = (templ->bind & PIPE_BIND_SCANOUT);
+   info->bind_cursor = (templ->bind & PIPE_BIND_CURSOR);
+}
+
 static enum gen_surface_tiling
 winsys_to_surface_tiling(enum intel_tiling_mode tiling)
 {
@@ -178,8 +311,8 @@ tex_create_bo(struct ilo_texture *tex)
    if (!bo)
       return false;
 
-   intel_bo_unref(tex->image.bo);
-   tex->image.bo = bo;
+   intel_bo_unref(tex->vma.bo);
+   ilo_vma_set_bo(&tex->vma, &is->dev, bo, 0);
 
    return true;
 }
@@ -206,7 +339,7 @@ tex_create_separate_stencil(struct ilo_texture *tex)
 
    tex->separate_s8 = ilo_texture(s8);
 
-   assert(tex->separate_s8->image.format == PIPE_FORMAT_S8_UINT);
+   assert(tex->separate_s8->image_format == PIPE_FORMAT_S8_UINT);
 
    return true;
 }
@@ -215,15 +348,16 @@ static bool
 tex_create_hiz(struct ilo_texture *tex)
 {
    const struct pipe_resource *templ = &tex->base;
+   const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height;
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    struct intel_bo *bo;
 
-   bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture",
-         tex->image.aux.bo_stride * tex->image.aux.bo_height, false);
+   bo = intel_winsys_alloc_bo(is->dev.winsys, "hiz texture", size, false);
    if (!bo)
       return false;
 
-   tex->image.aux.bo = bo;
+   ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096);
+   ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0);
 
    if (tex->imported) {
       unsigned lv;
@@ -246,17 +380,18 @@ tex_create_hiz(struct ilo_texture *tex)
 static bool
 tex_create_mcs(struct ilo_texture *tex)
 {
+   const uint32_t size = tex->image.aux.bo_stride * tex->image.aux.bo_height;
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    struct intel_bo *bo;
 
    assert(tex->image.aux.enables == (1 << (tex->base.last_level + 1)) - 1);
 
-   bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture",
-         tex->image.aux.bo_stride * tex->image.aux.bo_height, false);
+   bo = intel_winsys_alloc_bo(is->dev.winsys, "mcs texture", size, false);
    if (!bo)
       return false;
 
-   tex->image.aux.bo = bo;
+   ilo_vma_init(&tex->aux_vma, &is->dev, size, 4096);
+   ilo_vma_set_bo(&tex->aux_vma, &is->dev, bo, 0);
 
    return true;
 }
@@ -267,8 +402,8 @@ tex_destroy(struct ilo_texture *tex)
    if (tex->separate_s8)
       tex_destroy(tex->separate_s8);
 
-   intel_bo_unref(tex->image.bo);
-   intel_bo_unref(tex->image.aux.bo);
+   intel_bo_unref(tex->vma.bo);
+   intel_bo_unref(tex->aux_vma.bo);
 
    tex_free_slices(tex);
    FREE(tex);
@@ -277,24 +412,16 @@ tex_destroy(struct ilo_texture *tex)
 static bool
 tex_alloc_bos(struct ilo_texture *tex)
 {
-   struct ilo_screen *is = ilo_screen(tex->base.screen);
-
    if (!tex->imported && !tex_create_bo(tex))
       return false;
 
-   /* allocate separate stencil resource */
-   if (tex->image.separate_stencil && !tex_create_separate_stencil(tex))
-      return false;
-
    switch (tex->image.aux.type) {
    case ILO_IMAGE_AUX_HIZ:
-      if (!tex_create_hiz(tex) &&
-          !ilo_image_disable_aux(&tex->image, &is->dev))
+      if (!tex_create_hiz(tex))
          return false;
       break;
    case ILO_IMAGE_AUX_MCS:
-      if (!tex_create_mcs(tex) &&
-          !ilo_image_disable_aux(&tex->image, &is->dev))
+      if (!tex_create_mcs(tex))
          return false;
       break;
    default:
@@ -304,9 +431,10 @@ tex_alloc_bos(struct ilo_texture *tex)
    return true;
 }
 
-static bool
+static struct intel_bo *
 tex_import_handle(struct ilo_texture *tex,
-                  const struct winsys_handle *handle)
+                  const struct winsys_handle *handle,
+                  struct ilo_image_info *info)
 {
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    const struct pipe_resource *templ = &tex->base;
@@ -317,45 +445,94 @@ tex_import_handle(struct ilo_texture *tex,
 
    bo = intel_winsys_import_handle(is->dev.winsys, name, handle,
          tex->image.bo_height, &tiling, &pitch);
-   if (!bo)
-      return false;
+   /* modify image info */
+   if (bo) {
+      const uint8_t valid_tilings = 1 << winsys_to_surface_tiling(tiling);
 
-   if (!ilo_image_init_for_imported(&tex->image, &is->dev, templ,
-            winsys_to_surface_tiling(tiling), pitch)) {
-      ilo_err("failed to import handle for texture\n");
-      intel_bo_unref(bo);
-      return false;
+      if (info->valid_tilings && !(info->valid_tilings & valid_tilings)) {
+         intel_bo_unref(bo);
+         return NULL;
+      }
+
+      info->valid_tilings = valid_tilings;
+      info->force_bo_stride = pitch;
+
+      /* assume imported RTs are also scanouts */
+      if (!info->bind_scanout)
+         info->bind_scanout = (templ->bind & PIPE_BIND_RENDER_TARGET);
    }
 
-   tex->image.bo = bo;
-
-   tex->imported = true;
-
-   return true;
+   return bo;
 }
 
 static bool
 tex_init_image(struct ilo_texture *tex,
-               const struct winsys_handle *handle)
+               const struct winsys_handle *handle,
+               bool *separate_stencil)
 {
    struct ilo_screen *is = ilo_screen(tex->base.screen);
    const struct pipe_resource *templ = &tex->base;
    struct ilo_image *img = &tex->image;
+   struct intel_bo *imported_bo = NULL;
+   struct ilo_image_info info;
+
+   tex->image_format = resource_get_image_format(templ,
+         &is->dev, separate_stencil);
+   resource_get_image_info(templ, &is->dev, tex->image_format, &info);
 
    if (handle) {
-      if (!tex_import_handle(tex, handle))
+      imported_bo = tex_import_handle(tex, handle, &info);
+      if (!imported_bo)
          return false;
-   } else {
-      ilo_image_init(img, &is->dev, templ);
    }
 
-   if (img->bo_height > ilo_max_resource_size / img->bo_stride)
+   if (!ilo_image_init(img, &is->dev, &info)) {
+      intel_bo_unref(imported_bo);
       return false;
+   }
+
+   /*
+    * HiZ requires 8x4 alignment and some levels might need HiZ disabled.  It
+    * is generally fine except on Gen6, where HiZ and separate stencil must be
+    * enabled together.  For PIPE_FORMAT_Z24X8_UNORM with separate stencil, we
+    * can live with stencil values being interleaved for levels where HiZ is
+    * disabled.  But that is not the case for PIPE_FORMAT_Z32_FLOAT with
+    * separate stencil.  If HiZ were disabled for a level, we would need
+    * PIPE_FORMAT_Z32_FLOAT_S8X24_UINT for that level, and that format has a
+    * different bpp.  In other words, HiZ has to be available for all
+    * levels.
+    */
+   if (ilo_dev_gen(&is->dev) == ILO_GEN(6) &&
+       templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+       tex->image_format == PIPE_FORMAT_Z32_FLOAT &&
+       img->aux.enables != (1 << templ->last_level)) {
+      tex->image_format = templ->format;
+      info.format = pipe_to_surface_format(&is->dev, tex->image_format);
+      info.interleaved_stencil = true;
+
+      memset(img, 0, sizeof(*img));
+      if (!ilo_image_init(img, &is->dev, &info)) {
+         intel_bo_unref(imported_bo);
+         return false;
+      }
+   }
+
+   if (img->bo_height > ilo_max_resource_size / img->bo_stride ||
+       !ilo_vma_init(&tex->vma, &is->dev, img->bo_stride * img->bo_height,
+          4096)) {
+      intel_bo_unref(imported_bo);
+      return false;
+   }
+
+   if (imported_bo) {
+      ilo_vma_set_bo(&tex->vma, &is->dev, imported_bo, 0);
+      tex->imported = true;
+   }
 
    if (templ->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
       /* require on-the-fly tiling/untiling or format conversion */
-      if (img->tiling == GEN8_TILING_W || img->separate_stencil ||
-          img->format != templ->format)
+      if (img->tiling == GEN8_TILING_W || *separate_stencil ||
+          tex->image_format != templ->format)
          return false;
    }
 
@@ -371,6 +548,7 @@ tex_create(struct pipe_screen *screen,
            const struct winsys_handle *handle)
 {
    struct ilo_texture *tex;
+   bool separate_stencil;
 
    tex = CALLOC_STRUCT(ilo_texture);
    if (!tex)
@@ -380,12 +558,13 @@ tex_create(struct pipe_screen *screen,
    tex->base.screen = screen;
    pipe_reference_init(&tex->base.reference, 1);
 
-   if (!tex_init_image(tex, handle)) {
+   if (!tex_init_image(tex, handle, &separate_stencil)) {
       FREE(tex);
       return NULL;
    }
 
-   if (!tex_alloc_bos(tex)) {
+   if (!tex_alloc_bos(tex) ||
+       (separate_stencil && !tex_create_separate_stencil(tex))) {
       tex_destroy(tex);
       return NULL;
    }
@@ -406,7 +585,7 @@ tex_get_handle(struct ilo_texture *tex, struct winsys_handle *handle)
    else
       tiling = surface_to_winsys_tiling(tex->image.tiling);
 
-   err = intel_winsys_export_handle(is->dev.winsys, tex->image.bo, tiling,
+   err = intel_winsys_export_handle(is->dev.winsys, tex->vma.bo, tiling,
          tex->image.bo_stride, tex->image.bo_height, handle);
 
    return !err;
@@ -420,13 +599,12 @@ buf_create_bo(struct ilo_buffer_resource *buf)
    const bool cpu_init = resource_get_cpu_init(&buf->base);
    struct intel_bo *bo;
 
-   bo = intel_winsys_alloc_bo(is->dev.winsys, name,
-         buf->buffer.bo_size, cpu_init);
+   bo = intel_winsys_alloc_bo(is->dev.winsys, name, buf->bo_size, cpu_init);
    if (!bo)
       return false;
 
-   intel_bo_unref(buf->buffer.bo);
-   buf->buffer.bo = bo;
+   intel_bo_unref(buf->vma.bo);
+   ilo_vma_set_bo(&buf->vma, &is->dev, bo, 0);
 
    return true;
 }
@@ -434,7 +612,7 @@ buf_create_bo(struct ilo_buffer_resource *buf)
 static void
 buf_destroy(struct ilo_buffer_resource *buf)
 {
-   intel_bo_unref(buf->buffer.bo);
+   intel_bo_unref(buf->vma.bo);
    FREE(buf);
 }
 
@@ -443,6 +621,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
 {
    const struct ilo_screen *is = ilo_screen(screen);
    struct ilo_buffer_resource *buf;
+   uint32_t alignment;
    unsigned size;
 
    buf = CALLOC_STRUCT(ilo_buffer_resource);
@@ -471,10 +650,17 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
        ilo_dev_gen(&is->dev) < ILO_GEN(7.5))
       size = align(size, 4096);
 
-   ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags);
+   if (templ->bind & PIPE_BIND_VERTEX_BUFFER)
+      size = ilo_state_vertex_buffer_size(&is->dev, size, &alignment);
+   if (templ->bind & PIPE_BIND_INDEX_BUFFER)
+      size = ilo_state_index_buffer_size(&is->dev, size, &alignment);
+   if (templ->bind & PIPE_BIND_STREAM_OUTPUT)
+      size = ilo_state_sol_buffer_size(&is->dev, size, &alignment);
 
-   if (buf->buffer.bo_size < templ->width0 ||
-       buf->buffer.bo_size > ilo_max_resource_size ||
+   buf->bo_size = size;
+
+   if (buf->bo_size < templ->width0 || buf->bo_size > ilo_max_resource_size ||
+       !ilo_vma_init(&buf->vma, &is->dev, buf->bo_size, 4096) ||
        !buf_create_bo(buf)) {
       FREE(buf);
       return NULL;
@@ -487,13 +673,33 @@ static boolean
 ilo_can_create_resource(struct pipe_screen *screen,
                         const struct pipe_resource *templ)
 {
+   struct ilo_screen *is = ilo_screen(screen);
+   enum pipe_format image_format;
+   struct ilo_image_info info;
    struct ilo_image img;
 
    if (templ->target == PIPE_BUFFER)
       return (templ->width0 <= ilo_max_resource_size);
 
+   image_format = resource_get_image_format(templ, &is->dev, NULL);
+   resource_get_image_info(templ, &is->dev, image_format, &info);
+
    memset(&img, 0, sizeof(img));
-   ilo_image_init(&img, &ilo_screen(screen)->dev, templ);
+   /* bail out on layout failure, as tex_init_image() does */
+   if (!ilo_image_init(&img, &is->dev, &info))
+      return false;
+
+   /* as in tex_init_image() */
+   if (ilo_dev_gen(&is->dev) == ILO_GEN(6) &&
+       templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+       image_format == PIPE_FORMAT_Z32_FLOAT &&
+       img.aux.enables != (1 << templ->last_level)) {
+      info.format = pipe_to_surface_format(&is->dev, templ->format);
+      info.interleaved_stencil = true;
+      memset(&img, 0, sizeof(img));
+      if (!ilo_image_init(&img, &is->dev, &info))
+         return false;
+   }
 
    return (img.bo_height <= ilo_max_resource_size / img.bo_stride);
 }
diff --git a/src/gallium/drivers/ilo/ilo_resource.h b/src/gallium/drivers/ilo/ilo_resource.h
index d602e0cbf70..8378af54741 100644
--- a/src/gallium/drivers/ilo/ilo_resource.h
+++ b/src/gallium/drivers/ilo/ilo_resource.h
@@ -29,8 +29,8 @@
 #define ILO_RESOURCE_H
 
 #include "core/intel_winsys.h"
-#include "core/ilo_buffer.h"
 #include "core/ilo_image.h"
+#include "core/ilo_vma.h"
 
 #include "ilo_common.h"
 #include "ilo_screen.h"
@@ -92,7 +92,10 @@ struct ilo_texture {
 
    bool imported;
 
+   enum pipe_format image_format;
    struct ilo_image image;
+   struct ilo_vma vma;
+   struct ilo_vma aux_vma;
 
    /* XXX thread-safety */
    struct ilo_texture_slice *slices[PIPE_MAX_TEXTURE_LEVELS];
@@ -103,14 +106,15 @@ struct ilo_texture {
 struct ilo_buffer_resource {
    struct pipe_resource base;
 
-   struct ilo_buffer buffer;
+   uint32_t bo_size;
+   struct ilo_vma vma;
 };
 
-static inline struct ilo_buffer *
-ilo_buffer(struct pipe_resource *res)
+static inline struct ilo_buffer_resource *
+ilo_buffer_resource(struct pipe_resource *res)
 {
-   return (res && res->target == PIPE_BUFFER) ?
-      &((struct ilo_buffer_resource *) res)->buffer : NULL;
+   return (struct ilo_buffer_resource *)
+      ((res && res->target == PIPE_BUFFER) ? res : NULL);
 }
 
 static inline struct ilo_texture *
@@ -127,13 +131,14 @@ bool
 ilo_resource_rename_bo(struct pipe_resource *res);
 
 /**
- * Return the bo of the resource.
+ * Return the VMA of the resource.
  */
-static inline struct intel_bo *
-ilo_resource_get_bo(struct pipe_resource *res)
+static inline const struct ilo_vma *
+ilo_resource_get_vma(struct pipe_resource *res)
 {
    return (res->target == PIPE_BUFFER) ?
-      ilo_buffer(res)->bo : ilo_texture(res)->image.bo;
+      &((struct ilo_buffer_resource *) res)->vma :
+      &((struct ilo_texture *) res)->vma;
 }
 
 static inline struct ilo_texture_slice *
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 94105559b80..ab4d1377c9f 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -193,6 +193,7 @@ ilo_get_compute_param(struct pipe_screen *screen,
       uint32_t max_clock_frequency;
       uint32_t max_compute_units;
       uint32_t images_supported;
+      uint32_t subgroup_size;
    } val;
    const void *ptr;
    int size;
@@ -284,6 +285,13 @@ ilo_get_compute_param(struct pipe_screen *screen,
       ptr = &val.images_supported;
       size = sizeof(val.images_supported);
       break;
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      /* best case is actually SIMD32 */
+      val.subgroup_size = 16;
+
+      ptr = &val.subgroup_size;
+      size = sizeof(val.subgroup_size);
+      break;
    default:
       ptr = NULL;
       size = 0;
@@ -443,6 +451,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_GATHER_SM5:
       return 0;
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return true;
    case PIPE_CAP_FAKE_SW_MSAA:
    case PIPE_CAP_TEXTURE_QUERY_LOD:
@@ -457,6 +467,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -665,13 +677,6 @@ ilo_screen_fence_finish(struct pipe_screen *screen,
    return signaled;
 }
 
-static boolean
-ilo_screen_fence_signalled(struct pipe_screen *screen,
-                           struct pipe_fence_handle *fence)
-{
-   return ilo_screen_fence_finish(screen, fence, 0);
-}
-
 /**
  * Create a fence for \p bo.  When \p bo is not NULL, it must be submitted
  * before waited on or checked.
@@ -738,7 +743,6 @@ ilo_screen_create(struct intel_winsys *ws)
    is->base.flush_frontbuffer = NULL;
 
    is->base.fence_reference = ilo_screen_fence_reference;
-   is->base.fence_signalled = ilo_screen_fence_signalled;
    is->base.fence_finish = ilo_screen_fence_finish;
 
    is->base.get_driver_query_info = NULL;
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index 63534f33fa7..d89765a9d23 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -379,13 +379,12 @@ finalize_cbuf_state(struct ilo_context *ilo,
       u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
             cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
 
-      cbuf->cso[i].info.buf = ilo_buffer(cbuf->cso[i].resource);
+      cbuf->cso[i].info.vma = ilo_resource_get_vma(cbuf->cso[i].resource);
       cbuf->cso[i].info.offset = offset;
 
       memset(&cbuf->cso[i].surface, 0, sizeof(cbuf->cso[i].surface));
       ilo_state_surface_init_for_buffer(&cbuf->cso[i].surface,
             ilo->dev, &cbuf->cso[i].info);
-      cbuf->cso[i].surface.bo = cbuf->cso[i].info.buf->bo;
 
       ilo->state_vector.dirty |= ILO_DIRTY_CBUF;
    }
@@ -466,11 +465,9 @@ finalize_index_buffer(struct ilo_context *ilo)
 
    memset(&info, 0, sizeof(info));
    if (vec->ib.hw_resource) {
-      info.buf = ilo_buffer(vec->ib.hw_resource);
-      info.size = info.buf->bo_size;
+      info.vma = ilo_resource_get_vma(vec->ib.hw_resource);
+      info.size = info.vma->vm_size;
       info.format = ilo_translate_index_size(vec->ib.hw_index_size);
-
-      vec->ib.ib.bo = info.buf->bo;
    }
 
    ilo_state_index_buffer_set_info(&vec->ib.ib, dev, &info);
@@ -532,13 +529,11 @@ finalize_vertex_buffers(struct ilo_context *ilo)
       const struct pipe_vertex_buffer *cso = &vec->vb.states[pipe_idx];
 
       if (cso->buffer) {
-         info.buf = ilo_buffer(cso->buffer);
+         info.vma = ilo_resource_get_vma(cso->buffer);
          info.offset = cso->buffer_offset;
-         info.size = info.buf->bo_size;
+         info.size = info.vma->vm_size - cso->buffer_offset;
 
          info.stride = cso->stride;
-
-         vec->vb.vb[i].bo = info.buf->bo;
       } else {
          memset(&info, 0, sizeof(info));
       }
@@ -1566,24 +1561,23 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
          cso->info.size = buf[i].buffer_size;
 
          if (buf[i].buffer) {
-            cso->info.buf = ilo_buffer(buf[i].buffer);
+            cso->info.vma = ilo_resource_get_vma(buf[i].buffer);
             cso->info.offset = buf[i].buffer_offset;
 
             memset(&cso->surface, 0, sizeof(cso->surface));
             ilo_state_surface_init_for_buffer(&cso->surface, dev, &cso->info);
-            cso->surface.bo = cso->info.buf->bo;
 
             cso->user_buffer = NULL;
 
             cbuf->enabled_mask |= 1 << (index + i);
          } else if (buf[i].user_buffer) {
-            cso->info.buf = NULL;
+            cso->info.vma = NULL;
             /* buffer_offset does not apply for user buffer */
             cso->user_buffer = buf[i].user_buffer;
 
             cbuf->enabled_mask |= 1 << (index + i);
          } else {
-            cso->info.buf = NULL;
+            cso->info.vma = NULL;
             cso->info.size = 0;
             cso->user_buffer = NULL;
 
@@ -1596,7 +1590,7 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
 
          pipe_resource_reference(&cso->resource, NULL);
 
-         cso->info.buf = NULL;
+         cso->info.vma = NULL;
          cso->info.size = 0;
          cso->user_buffer = NULL;
 
@@ -1705,10 +1699,11 @@ ilo_set_framebuffer_state(struct pipe_context *pipe,
    if (state->zsbuf) {
       const struct ilo_surface_cso *cso =
          (const struct ilo_surface_cso *) state->zsbuf;
+      const struct ilo_texture *tex = ilo_texture(cso->base.texture);
 
-      fb->has_hiz = cso->u.zs.hiz_bo;
+      fb->has_hiz = cso->u.zs.hiz_vma;
       fb->depth_offset_format =
-         ilo_state_zs_get_depth_format(&cso->u.zs, dev);
+         ilo_format_translate_depth(dev, tex->image_format);
    } else {
       fb->has_hiz = false;
       fb->depth_offset_format = GEN6_ZFORMAT_D32_FLOAT;
@@ -1854,10 +1849,11 @@ ilo_set_sampler_views(struct pipe_context *pipe, unsigned shader,
 }
 
 static void
-ilo_set_shader_resources(struct pipe_context *pipe,
-                         unsigned start, unsigned count,
-                         struct pipe_surface **surfaces)
+ilo_set_shader_images(struct pipe_context *pipe, unsigned shader,
+                      unsigned start, unsigned count,
+                      struct pipe_image_view **views)
 {
+#if 0
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
    struct ilo_resource_state *dst = &vec->resource;
    unsigned i;
@@ -1886,6 +1882,7 @@ ilo_set_shader_resources(struct pipe_context *pipe,
    }
 
    vec->dirty |= ILO_DIRTY_RESOURCE;
+#endif
 }
 
 static void
@@ -1945,12 +1942,11 @@ ilo_create_stream_output_target(struct pipe_context *pipe,
    target->base.buffer_size = buffer_size;
 
    memset(&info, 0, sizeof(info));
-   info.buf = ilo_buffer(res);
+   info.vma = ilo_resource_get_vma(res);
    info.offset = buffer_offset;
    info.size = buffer_size;
 
    ilo_state_sol_buffer_init(&target->sb, dev, &info);
-   target->sb.bo = info.buf->bo;
 
    return &target->base;
 }
@@ -2018,18 +2014,17 @@ ilo_create_sampler_view(struct pipe_context *pipe,
       struct ilo_state_surface_buffer_info info;
 
       memset(&info, 0, sizeof(info));
-      info.buf = ilo_buffer(res);
+      info.vma = ilo_resource_get_vma(res);
+      info.offset = templ->u.buf.first_element * info.struct_size;
+      info.size = (templ->u.buf.last_element -
+            templ->u.buf.first_element + 1) * info.struct_size;
       info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
       info.format = ilo_format_translate_color(dev, templ->format);
       info.format_size = util_format_get_blocksize(templ->format);
       info.struct_size = info.format_size;
       info.readonly = true;
-      info.offset = templ->u.buf.first_element * info.struct_size;
-      info.size = (templ->u.buf.last_element -
-            templ->u.buf.first_element + 1) * info.struct_size;
 
       ilo_state_surface_init_for_buffer(&view->surface, dev, &info);
-      view->surface.bo = info.buf->bo;
    } else {
       struct ilo_texture *tex = ilo_texture(res);
       struct ilo_state_surface_image_info info;
@@ -2042,23 +2037,8 @@ ilo_create_sampler_view(struct pipe_context *pipe,
       }
 
       memset(&info, 0, sizeof(info));
+
       info.img = &tex->image;
-
-      info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
-
-      if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
-          tex->image.separate_stencil) {
-         info.format = ilo_format_translate_texture(dev,
-               PIPE_FORMAT_Z32_FLOAT);
-      } else {
-         info.format = ilo_format_translate_texture(dev, templ->format);
-      }
-
-      info.is_cube_map = (tex->image.target == PIPE_TEXTURE_CUBE ||
-                          tex->image.target == PIPE_TEXTURE_CUBE_ARRAY);
-      info.is_array = util_resource_is_array_texture(&tex->base);
-      info.readonly = true;
-
       info.level_base = templ->u.tex.first_level;
       info.level_count = templ->u.tex.last_level -
          templ->u.tex.first_level + 1;
@@ -2066,8 +2046,22 @@ ilo_create_sampler_view(struct pipe_context *pipe,
       info.slice_count = templ->u.tex.last_layer -
          templ->u.tex.first_layer + 1;
 
+      info.vma = &tex->vma;
+      info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
+      info.type = tex->image.type;
+
+      if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+          tex->separate_s8) {
+         info.format = ilo_format_translate_texture(dev,
+               PIPE_FORMAT_Z32_FLOAT);
+      } else {
+         info.format = ilo_format_translate_texture(dev, templ->format);
+      }
+
+      info.is_array = util_resource_is_array_texture(&tex->base);
+      info.readonly = true;
+
       ilo_state_surface_init_for_image(&view->surface, dev, &info);
-      view->surface.bo = info.img->bo;
    }
 
    return &view->base;
@@ -2111,18 +2105,27 @@ ilo_create_surface(struct pipe_context *pipe,
       assert(tex->base.target != PIPE_BUFFER);
 
       memset(&info, 0, sizeof(info));
+
       info.img = &tex->image;
-      info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER;
-      info.format = ilo_format_translate_render(dev, templ->format);
-      info.is_array = util_resource_is_array_texture(&tex->base);
       info.level_base = templ->u.tex.level;
       info.level_count = 1;
       info.slice_base = templ->u.tex.first_layer;
       info.slice_count = templ->u.tex.last_layer -
          templ->u.tex.first_layer + 1;
 
+      info.vma = &tex->vma;
+      if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level))
+         info.aux_vma = &tex->aux_vma;
+
+      info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER;
+
+      info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ?
+         GEN6_SURFTYPE_2D : tex->image.type;
+
+      info.format = ilo_format_translate_render(dev, templ->format);
+      info.is_array = util_resource_is_array_texture(&tex->base);
+
       ilo_state_surface_init_for_image(&surf->u.rt, dev, &info);
-      surf->u.rt.bo = info.img->bo;
    } else {
       struct ilo_state_zs_info info;
 
@@ -2131,13 +2134,19 @@ ilo_create_surface(struct pipe_context *pipe,
       memset(&info, 0, sizeof(info));
 
       if (templ->format == PIPE_FORMAT_S8_UINT) {
+         info.s_vma = &tex->vma;
          info.s_img = &tex->image;
       } else {
+         info.z_vma = &tex->vma;
          info.z_img = &tex->image;
-         info.s_img = (tex->separate_s8) ? &tex->separate_s8->image : NULL;
 
-         info.hiz_enable =
-            ilo_image_can_enable_aux(&tex->image, templ->u.tex.level);
+         if (tex->separate_s8) {
+            info.s_vma = &tex->separate_s8->vma;
+            info.s_img = &tex->separate_s8->image;
+         }
+
+         if (ilo_image_can_enable_aux(&tex->image, templ->u.tex.level))
+            info.hiz_vma = &tex->aux_vma;
       }
 
       info.level = templ->u.tex.level;
@@ -2145,16 +2154,15 @@ ilo_create_surface(struct pipe_context *pipe,
       info.slice_count = templ->u.tex.last_layer -
          templ->u.tex.first_layer + 1;
 
+      info.type = (tex->image.type == GEN6_SURFTYPE_CUBE) ?
+         GEN6_SURFTYPE_2D : tex->image.type;
+
+      info.format = ilo_format_translate_depth(dev, tex->image_format);
+      if (ilo_dev_gen(dev) == ILO_GEN(6) && !info.hiz_vma &&
+          tex->image_format == PIPE_FORMAT_Z24X8_UNORM)
+         info.format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+
       ilo_state_zs_init(&surf->u.zs, dev, &info);
-
-      if (info.z_img) {
-         surf->u.zs.depth_bo = info.z_img->bo;
-         if (info.hiz_enable)
-            surf->u.zs.hiz_bo = info.z_img->aux.bo;
-      }
-
-      if (info.s_img)
-         surf->u.zs.stencil_bo = info.s_img->bo;
    }
 
    return &surf->base;
@@ -2339,7 +2347,7 @@ ilo_init_state_functions(struct ilo_context *ilo)
    ilo->base.set_scissor_states = ilo_set_scissor_states;
    ilo->base.set_viewport_states = ilo_set_viewport_states;
    ilo->base.set_sampler_views = ilo_set_sampler_views;
-   ilo->base.set_shader_resources = ilo_set_shader_resources;
+   ilo->base.set_shader_images = ilo_set_shader_images;
    ilo->base.set_vertex_buffers = ilo_set_vertex_buffers;
    ilo->base.set_index_buffer = ilo_set_index_buffer;
 
@@ -2451,7 +2459,6 @@ void
 ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
                                   struct pipe_resource *res)
 {
-   struct intel_bo *bo = ilo_resource_get_bo(res);
    uint32_t states = 0;
    unsigned sh, i;
 
@@ -2482,10 +2489,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
 
       for (i = 0; i < vec->so.count; i++) {
          if (vec->so.states[i]->buffer == res) {
-            struct ilo_stream_output_target *target =
-               (struct ilo_stream_output_target *) vec->so.states[i];
-
-            target->sb.bo = ilo_buffer(res)->bo;
             states |= ILO_DIRTY_SO;
             break;
          }
@@ -2503,7 +2506,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
                [PIPE_SHADER_GEOMETRY]  = ILO_DIRTY_VIEW_GS,
                [PIPE_SHADER_COMPUTE]   = ILO_DIRTY_VIEW_CS,
             };
-            cso->surface.bo = bo;
 
             states |= view_dirty_bits[sh];
             break;
@@ -2515,7 +2517,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
             struct ilo_cbuf_cso *cbuf = &vec->cbuf[sh].cso[i];
 
             if (cbuf->resource == res) {
-               cbuf->surface.bo = bo;
                states |= ILO_DIRTY_CBUF;
                break;
             }
@@ -2528,7 +2529,6 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
          (struct ilo_surface_cso *) vec->resource.states[i];
 
       if (cso->base.texture == res) {
-         cso->u.rt.bo = bo;
          states |= ILO_DIRTY_RESOURCE;
          break;
       }
@@ -2540,27 +2540,19 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
          struct ilo_surface_cso *cso =
             (struct ilo_surface_cso *) vec->fb.state.cbufs[i];
          if (cso && cso->base.texture == res) {
-            cso->u.rt.bo = bo;
             states |= ILO_DIRTY_FB;
             break;
          }
       }
 
-      if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res) {
-         struct ilo_surface_cso *cso =
-            (struct ilo_surface_cso *) vec->fb.state.zsbuf;
-
-         cso->u.zs.depth_bo = bo;
-
+      if (vec->fb.state.zsbuf && vec->fb.state.zsbuf->texture == res)
          states |= ILO_DIRTY_FB;
-      }
    }
 
    for (i = 0; i < vec->cs_resource.count; i++) {
       struct ilo_surface_cso *cso =
          (struct ilo_surface_cso *) vec->cs_resource.states[i];
       if (cso->base.texture == res) {
-         cso->u.rt.bo = bo;
          states |= ILO_DIRTY_CS_RESOURCE;
          break;
       }
diff --git a/src/gallium/drivers/ilo/ilo_state.h b/src/gallium/drivers/ilo/ilo_state.h
index 3e6fd8a2554..66c93007eb1 100644
--- a/src/gallium/drivers/ilo/ilo_state.h
+++ b/src/gallium/drivers/ilo/ilo_state.h
@@ -202,7 +202,7 @@ struct ilo_cbuf_state {
 };
 
 struct ilo_resource_state {
-   struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES];
+   struct pipe_surface *states[PIPE_MAX_SHADER_IMAGES];
    unsigned count;
 };
 
diff --git a/src/gallium/drivers/ilo/ilo_transfer.c b/src/gallium/drivers/ilo/ilo_transfer.c
index ec41473f94a..5abd3bebf68 100644
--- a/src/gallium/drivers/ilo/ilo_transfer.c
+++ b/src/gallium/drivers/ilo/ilo_transfer.c
@@ -100,7 +100,7 @@ resource_get_transfer_method(struct pipe_resource *res,
             m = ILO_TRANSFER_MAP_SW_ZS;
             need_convert = true;
          }
-      } else if (tex->image.format != tex->base.format) {
+      } else if (tex->image_format != tex->base.format) {
          m = ILO_TRANSFER_MAP_SW_CONVERT;
          need_convert = true;
       }
@@ -268,23 +268,27 @@ xfer_alloc_staging_sys(struct ilo_transfer *xfer)
 static void *
 xfer_map(struct ilo_transfer *xfer)
 {
+   const struct ilo_vma *vma;
    void *ptr;
 
    switch (xfer->method) {
    case ILO_TRANSFER_MAP_CPU:
-      ptr = intel_bo_map(ilo_resource_get_bo(xfer->base.resource),
-            xfer->base.usage & PIPE_TRANSFER_WRITE);
+      vma = ilo_resource_get_vma(xfer->base.resource);
+      ptr = intel_bo_map(vma->bo, xfer->base.usage & PIPE_TRANSFER_WRITE);
       break;
    case ILO_TRANSFER_MAP_GTT:
-      ptr = intel_bo_map_gtt(ilo_resource_get_bo(xfer->base.resource));
+      vma = ilo_resource_get_vma(xfer->base.resource);
+      ptr = intel_bo_map_gtt(vma->bo);
       break;
    case ILO_TRANSFER_MAP_GTT_ASYNC:
-      ptr = intel_bo_map_gtt_async(ilo_resource_get_bo(xfer->base.resource));
+      vma = ilo_resource_get_vma(xfer->base.resource);
+      ptr = intel_bo_map_gtt_async(vma->bo);
       break;
    case ILO_TRANSFER_MAP_STAGING:
       {
          const struct ilo_screen *is = ilo_screen(xfer->staging.res->screen);
-         struct intel_bo *bo = ilo_resource_get_bo(xfer->staging.res);
+
+         vma = ilo_resource_get_vma(xfer->staging.res);
 
          /*
           * We want a writable, optionally persistent and coherent, mapping
@@ -292,25 +296,29 @@ xfer_map(struct ilo_transfer *xfer)
           * this turns out to be fairly simple.
           */
          if (is->dev.has_llc)
-            ptr = intel_bo_map(bo, true);
+            ptr = intel_bo_map(vma->bo, true);
          else
-            ptr = intel_bo_map_gtt(bo);
+            ptr = intel_bo_map_gtt(vma->bo);
 
          if (ptr && xfer->staging.res->target == PIPE_BUFFER)
             ptr += (xfer->base.box.x % ILO_TRANSFER_MAP_BUFFER_ALIGNMENT);
-
       }
       break;
    case ILO_TRANSFER_MAP_SW_CONVERT:
    case ILO_TRANSFER_MAP_SW_ZS:
+      vma = NULL;
       ptr = xfer->staging.sys;
       break;
    default:
       assert(!"unknown mapping method");
+      vma = NULL;
       ptr = NULL;
       break;
    }
 
+   if (ptr && vma)
+      ptr = (void *) ((char *) ptr + vma->bo_offset);
+
    return ptr;
 }
 
@@ -324,10 +332,10 @@ xfer_unmap(struct ilo_transfer *xfer)
    case ILO_TRANSFER_MAP_CPU:
    case ILO_TRANSFER_MAP_GTT:
    case ILO_TRANSFER_MAP_GTT_ASYNC:
-      intel_bo_unmap(ilo_resource_get_bo(xfer->base.resource));
+      intel_bo_unmap(ilo_resource_get_vma(xfer->base.resource)->bo);
       break;
    case ILO_TRANSFER_MAP_STAGING:
-      intel_bo_unmap(ilo_resource_get_bo(xfer->staging.res));
+      intel_bo_unmap(ilo_resource_get_vma(xfer->staging.res)->bo);
       break;
    default:
       break;
@@ -541,9 +549,12 @@ tex_staging_sys_map_bo(struct ilo_texture *tex,
 
    if (prefer_cpu && (tex->image.tiling == GEN6_TILING_NONE ||
                       !linear_view))
-      ptr = intel_bo_map(tex->image.bo, !for_read_back);
+      ptr = intel_bo_map(tex->vma.bo, !for_read_back);
    else
-      ptr = intel_bo_map_gtt(tex->image.bo);
+      ptr = intel_bo_map_gtt(tex->vma.bo);
+
+   if (ptr)
+      ptr = (void *) ((char *) ptr + tex->vma.bo_offset);
 
    return ptr;
 }
@@ -551,7 +562,7 @@ tex_staging_sys_map_bo(struct ilo_texture *tex,
 static void
 tex_staging_sys_unmap_bo(struct ilo_texture *tex)
 {
-   intel_bo_unmap(tex->image.bo);
+   intel_bo_unmap(tex->vma.bo);
 }
 
 static bool
@@ -590,7 +601,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex,
       s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row);
 
       if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
-         assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM);
+         assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM);
 
          dst_cpp = 4;
          dst_s8_pos = 3;
@@ -598,7 +609,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex,
       }
       else {
          assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT);
+         assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT);
 
          dst_cpp = 8;
          dst_s8_pos = 4;
@@ -644,7 +655,7 @@ tex_staging_sys_zs_read(struct ilo_texture *tex,
       tex_staging_sys_unmap_bo(s8_tex);
    }
    else {
-      assert(tex->image.format == PIPE_FORMAT_S8_UINT);
+      assert(tex->image_format == PIPE_FORMAT_S8_UINT);
 
       for (slice = 0; slice < box->depth; slice++) {
          unsigned mem_x, mem_y;
@@ -717,7 +728,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex,
       s8_tile_offset = tex_tile_choose_offset_func(s8_tex, &s8_tiles_per_row);
 
       if (tex->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
-         assert(tex->image.format == PIPE_FORMAT_Z24X8_UNORM);
+         assert(tex->image_format == PIPE_FORMAT_Z24X8_UNORM);
 
          src_cpp = 4;
          src_s8_pos = 3;
@@ -725,7 +736,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex,
       }
       else {
          assert(tex->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         assert(tex->image.format == PIPE_FORMAT_Z32_FLOAT);
+         assert(tex->image_format == PIPE_FORMAT_Z32_FLOAT);
 
          src_cpp = 8;
          src_s8_pos = 4;
@@ -771,7 +782,7 @@ tex_staging_sys_zs_write(struct ilo_texture *tex,
       tex_staging_sys_unmap_bo(s8_tex);
    }
    else {
-      assert(tex->image.format == PIPE_FORMAT_S8_UINT);
+      assert(tex->image_format == PIPE_FORMAT_S8_UINT);
 
       for (slice = 0; slice < box->depth; slice++) {
          unsigned mem_x, mem_y;
@@ -829,8 +840,8 @@ tex_staging_sys_convert_write(struct ilo_texture *tex,
    else
       dst_slice_stride = 0;
 
-   if (unlikely(tex->image.format == tex->base.format)) {
-      util_copy_box(dst, tex->image.format, tex->image.bo_stride,
+   if (unlikely(tex->image_format == tex->base.format)) {
+      util_copy_box(dst, tex->image_format, tex->image.bo_stride,
             dst_slice_stride, 0, 0, 0, box->width, box->height, box->depth,
             xfer->staging.sys, xfer->base.stride, xfer->base.layer_stride,
             0, 0, 0);
@@ -842,7 +853,7 @@ tex_staging_sys_convert_write(struct ilo_texture *tex,
 
    switch (tex->base.format) {
    case PIPE_FORMAT_ETC1_RGB8:
-      assert(tex->image.format == PIPE_FORMAT_R8G8B8X8_UNORM);
+      assert(tex->image_format == PIPE_FORMAT_R8G8B8X8_UNORM);
 
       for (slice = 0; slice < box->depth; slice++) {
          const void *src =
@@ -1055,7 +1066,7 @@ choose_transfer_method(struct ilo_context *ilo, struct ilo_transfer *xfer)
       return false;
 
    /* see if we can avoid blocking */
-   if (is_bo_busy(ilo, ilo_resource_get_bo(res), &need_submit)) {
+   if (is_bo_busy(ilo, ilo_resource_get_vma(res)->bo, &need_submit)) {
       bool resource_renamed;
 
       if (!xfer_unblock(xfer, &resource_renamed)) {
@@ -1078,11 +1089,11 @@ static void
 buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res,
            unsigned usage, int offset, int size, const void *data)
 {
-   struct ilo_buffer *buf = ilo_buffer(res);
+   struct ilo_buffer_resource *buf = ilo_buffer_resource(res);
    bool need_submit;
 
    /* see if we can avoid blocking */
-   if (is_bo_busy(ilo, buf->bo, &need_submit)) {
+   if (is_bo_busy(ilo, buf->vma.bo, &need_submit)) {
       bool unblocked = false;
 
       if ((usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) &&
@@ -1103,9 +1114,12 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res,
          templ.bind = PIPE_BIND_TRANSFER_WRITE;
          staging = ilo->base.screen->resource_create(ilo->base.screen, &templ);
          if (staging) {
+            const struct ilo_vma *staging_vma = ilo_resource_get_vma(staging);
             struct pipe_box staging_box;
 
-            intel_bo_pwrite(ilo_buffer(staging)->bo, 0, size, data);
+            /* offset by staging_vma->bo_offset for pwrite */
+            intel_bo_pwrite(staging_vma->bo, staging_vma->bo_offset,
+                  size, data);
 
             u_box_1d(0, size, &staging_box);
             ilo_blitter_blt_copy_resource(ilo->blitter,
@@ -1123,7 +1137,8 @@ buf_pwrite(struct ilo_context *ilo, struct pipe_resource *res,
          ilo_cp_submit(ilo->cp, "syncing for pwrites");
    }
 
-   intel_bo_pwrite(buf->bo, offset, size, data);
+   /* offset by buf->vma.bo_offset for pwrite */
+   intel_bo_pwrite(buf->vma.bo, buf->vma.bo_offset + offset, size, data);
 }
 
 static void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
index 1de43f77ee0..1feb415c9e5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
@@ -78,7 +78,7 @@ lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
 /**
  * Whether the blending factors are complementary of each other.
  */
-static INLINE boolean
+static inline boolean
 lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor)
 {
    return dst_factor == (src_factor ^ 0x10);
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 0d47c0d517c..c273b25f096 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -169,7 +169,7 @@ llvmpipe_user_buffer_create(struct pipe_screen *screen,
                             unsigned bind_flags);
 
 
-static INLINE struct llvmpipe_context *
+static inline struct llvmpipe_context *
 llvmpipe_context( struct pipe_context *pipe )
 {
    return (struct llvmpipe_context *)pipe;
diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h
index e0f7d8e1bc3..1038c5fe151 100644
--- a/src/gallium/drivers/llvmpipe/lp_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_debug.h
@@ -71,7 +71,7 @@ extern int LP_DEBUG;
 
 void st_debug_init( void );
 
-static INLINE void
+static inline void
 LP_DBG( unsigned flag, const char *fmt, ... )
 {
     if (LP_DEBUG & flag)
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h
index 3c591187801..d7f0c153ec8 100644
--- a/src/gallium/drivers/llvmpipe/lp_fence.h
+++ b/src/gallium/drivers/llvmpipe/lp_fence.h
@@ -72,7 +72,7 @@ llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen);
 void
 lp_fence_destroy(struct lp_fence *fence);
 
-static INLINE void
+static inline void
 lp_fence_reference(struct lp_fence **ptr,
                    struct lp_fence *f)
 {
@@ -85,7 +85,7 @@ lp_fence_reference(struct lp_fence **ptr,
    *ptr = f;
 }
 
-static INLINE boolean
+static inline boolean
 lp_fence_issued(const struct lp_fence *fence)
 {
    return fence->issued;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c209f47f0f5..c19f9318006 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -184,7 +184,7 @@ union lp_rast_cmd_arg {
 
 /* Cast wrappers.  Hopefully these compile to noops!
  */
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile )
 {
    union lp_rast_cmd_arg arg;
@@ -192,7 +192,7 @@ lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile )
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_triangle( const struct lp_rast_triangle *triangle,
                       unsigned plane_mask)
 {
@@ -208,7 +208,7 @@ lp_rast_arg_triangle( const struct lp_rast_triangle *triangle,
  * All planes are enabled, so instead of the plane mask we pass the upper
  * left coordinates of the a block that fully encloses the triangle.
  */
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle,
                                 unsigned x, unsigned y)
 {
@@ -218,7 +218,7 @@ lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle,
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_state( const struct lp_rast_state *state )
 {
    union lp_rast_cmd_arg arg;
@@ -226,7 +226,7 @@ lp_rast_arg_state( const struct lp_rast_state *state )
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_fence( struct lp_fence *fence )
 {
    union lp_rast_cmd_arg arg;
@@ -235,7 +235,7 @@ lp_rast_arg_fence( struct lp_fence *fence )
 }
 
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_clearzs( uint64_t value, uint64_t mask )
 {
    union lp_rast_cmd_arg arg;
@@ -245,7 +245,7 @@ lp_rast_arg_clearzs( uint64_t value, uint64_t mask )
 }
 
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_query( struct llvmpipe_query *pq )
 {
    union lp_rast_cmd_arg arg;
@@ -253,7 +253,7 @@ lp_rast_arg_query( struct llvmpipe_query *pq )
    return arg;
 }
 
-static INLINE union lp_rast_cmd_arg
+static inline union lp_rast_cmd_arg
 lp_rast_arg_null( void )
 {
    union lp_rast_cmd_arg arg;
@@ -312,7 +312,7 @@ lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
 #include <emmintrin.h>
 #include "util/u_sse.h"
 
-static INLINE __m128i
+static inline __m128i
 lp_plane_to_m128i(const struct lp_rast_plane *plane)
 {
    return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index e6ebbcd526d..9aa7e874657 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -145,7 +145,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
  * Get the pointer to a 4x4 color block (within a 64x64 tile).
  * \param x, y location of 4x4 block in window coords
  */
-static INLINE uint8_t *
+static inline uint8_t *
 lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,
                                 unsigned buf, unsigned x, unsigned y,
                                 unsigned layer)
@@ -186,7 +186,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task,
  * Get the pointer to a 4x4 depth block (within a 64x64 tile).
  * \param x, y location of 4x4 block in window coords
  */
-static INLINE uint8_t *
+static inline uint8_t *
 lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task,
                                 unsigned x, unsigned y, unsigned layer)
 {
@@ -222,7 +222,7 @@ lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task,
  * triangle in/out tests.
  * \param x, y location of 4x4 block in window coords
  */
-static INLINE void
+static inline void
 lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
                          const struct lp_rast_shader_inputs *inputs,
                          unsigned x, unsigned y )
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 41f6fbfa059..c9b9221d87c 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -63,7 +63,7 @@ block_full_16(struct lp_rasterizer_task *task,
 	 block_full_4(task, tri, x + ix, y + iy);
 }
 
-static INLINE unsigned
+static inline unsigned
 build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
 {
    unsigned mask = 0;
@@ -94,7 +94,7 @@ build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
 }
 
 
-static INLINE void
+static inline void
 build_masks(int64_t c,
             int64_t cdiff,
             int64_t dcdx,
@@ -167,7 +167,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 #include "util/u_sse.h"
 
 
-static INLINE void
+static inline void
 build_masks_32(int c, 
                int cdiff,
                int dcdx,
@@ -213,7 +213,7 @@ build_masks_32(int c,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 build_mask_linear_32(int c, int dcdx, int dcdy)
 {
    __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
@@ -239,7 +239,7 @@ build_mask_linear_32(int c, int dcdx, int dcdy)
    return _mm_movemask_epi8(result);
 }
 
-static INLINE unsigned
+static inline unsigned
 sign_bits4(const __m128i *cstep, int cdiff)
 {
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index a226ff0c485..b1464bb54c4 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -207,7 +207,7 @@ boolean lp_scene_is_resource_referenced(const struct lp_scene *scene,
  * Allocate space for a command/data in the bin's data buffer.
  * Grow the block list if needed.
  */
-static INLINE void *
+static inline void *
 lp_scene_alloc( struct lp_scene *scene, unsigned size)
 {
    struct data_block_list *list = &scene->data;
@@ -240,7 +240,7 @@ lp_scene_alloc( struct lp_scene *scene, unsigned size)
 /**
  * As above, but with specific alignment.
  */
-static INLINE void *
+static inline void *
 lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
 			unsigned alignment )
 {
@@ -272,7 +272,7 @@ lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
 
 /* Put back data if we decide not to use it, eg. culled triangles.
  */
-static INLINE void
+static inline void
 lp_scene_putback_data( struct lp_scene *scene, unsigned size)
 {
    struct data_block_list *list = &scene->data;
@@ -282,7 +282,7 @@ lp_scene_putback_data( struct lp_scene *scene, unsigned size)
 
 
 /** Return pointer to a particular tile's bin. */
-static INLINE struct cmd_bin *
+static inline struct cmd_bin *
 lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y)
 {
    return &scene->tile[x][y];
@@ -296,7 +296,7 @@ lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y);
 
 /* Add a command to bin[x][y].
  */
-static INLINE boolean
+static inline boolean
 lp_scene_bin_command( struct lp_scene *scene,
                       unsigned x, unsigned y,
                       unsigned cmd,
@@ -328,7 +328,7 @@ lp_scene_bin_command( struct lp_scene *scene,
 }
 
 
-static INLINE boolean
+static inline boolean
 lp_scene_bin_cmd_with_state( struct lp_scene *scene,
                              unsigned x, unsigned y,
                              const struct lp_rast_state *state,
@@ -354,7 +354,7 @@ lp_scene_bin_cmd_with_state( struct lp_scene *scene,
 
 /* Add a command to all active bins.
  */
-static INLINE boolean
+static inline boolean
 lp_scene_bin_everywhere( struct lp_scene *scene,
 			 unsigned cmd,
 			 const union lp_rast_cmd_arg arg )
@@ -371,7 +371,7 @@ lp_scene_bin_everywhere( struct lp_scene *scene,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 lp_scene_get_num_bins( const struct lp_scene *scene )
 {
    return scene->tiles_x * scene->tiles_y;
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 47f1897c732..14eeab03387 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -288,10 +288,14 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
       return 0;
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 1;
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
    }
    /* should only get here on unhandled cases */
@@ -528,18 +532,6 @@ llvmpipe_fence_reference(struct pipe_screen *screen,
 }
 
 
-/**
- * Has the fence been executed/finished?
- */
-static boolean
-llvmpipe_fence_signalled(struct pipe_screen *screen,
-                         struct pipe_fence_handle *fence)
-{
-   struct lp_fence *f = (struct lp_fence *) fence;
-   return lp_fence_signalled(f);
-}
-
-
 /**
  * Wait for the fence to finish.
  */
@@ -550,6 +542,9 @@ llvmpipe_fence_finish(struct pipe_screen *screen,
 {
    struct lp_fence *f = (struct lp_fence *) fence_handle;
 
+   if (!timeout)
+      return lp_fence_signalled(f);
+
    lp_fence_wait(f);
    return TRUE;
 }
@@ -601,7 +596,6 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
    screen->base.context_create = llvmpipe_create_context;
    screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer;
    screen->base.fence_reference = llvmpipe_fence_reference;
-   screen->base.fence_signalled = llvmpipe_fence_signalled;
    screen->base.fence_finish = llvmpipe_fence_finish;
 
    screen->base.get_timestamp = llvmpipe_get_timestamp;
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.h b/src/gallium/drivers/llvmpipe/lp_screen.h
index 8b8ea1afac9..00bf20c8c5f 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.h
+++ b/src/gallium/drivers/llvmpipe/lp_screen.h
@@ -62,7 +62,7 @@ struct llvmpipe_screen
 
 
 
-static INLINE struct llvmpipe_screen *
+static inline struct llvmpipe_screen *
 llvmpipe_screen( struct pipe_screen *pipe )
 {
    return (struct llvmpipe_screen *)pipe;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index c944ad26756..a42df2dc9e0 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -159,7 +159,7 @@ void
 lp_setup_end_query(struct lp_setup_context *setup,
                    struct llvmpipe_query *pq);
 
-static INLINE unsigned
+static inline unsigned
 lp_clamp_viewport_idx(int idx)
 {
    return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 6c05b90e64a..a190254d9df 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -233,7 +233,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
 
 
 
-static INLINE int subpixel_snap( float a )
+static inline int subpixel_snap( float a )
 {
    return util_iround(FIXED_ONE * a);
 }
@@ -262,14 +262,14 @@ print_line(struct lp_setup_context *setup,
 }
 
 
-static INLINE boolean sign(float x){
+static inline boolean sign(float x){
    return x >= 0;  
 }  
 
 
 /* Used on positive floats only:
  */
-static INLINE float fracf(float f)
+static inline float fracf(float f)
 {
    return f - floorf(f);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index f065676a7fb..75544b52493 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -296,7 +296,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
 }
 
 
-static INLINE int
+static inline int
 subpixel_snap(float a)
 {
    return util_iround(FIXED_ONE * a);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index a2f55ed3a1e..98a9d4bc28b 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -48,13 +48,13 @@
 #include <emmintrin.h>
 #endif
 
-static INLINE int
+static inline int
 subpixel_snap(float a)
 {
    return util_iround(FIXED_ONE * a);
 }
 
-static INLINE float
+static inline float
 fixed_to_float(int a)
 {
    return a * (1.0f / FIXED_ONE);
@@ -579,7 +579,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
  *
  * Undefined if no bit set exists, so code should check against 0 first.
  */
-static INLINE uint32_t 
+static inline uint32_t 
 floor_pot(uint32_t n)
 {
 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
@@ -841,7 +841,7 @@ static void retry_triangle_ccw( struct lp_setup_context *setup,
 /**
  * Calculate fixed position data for a triangle
  */
-static INLINE void
+static inline void
 calc_fixed_position( struct lp_setup_context *setup,
                      struct fixed_position* position,
                      const float (*v0)[4],
@@ -873,7 +873,7 @@ calc_fixed_position( struct lp_setup_context *setup,
  * Rotate a triangle, flipping its clockwise direction,
  * Swaps values for xy[0] and xy[1]
  */
-static INLINE void
+static inline void
 rotate_fixed_position_01( struct fixed_position* position )
 {
    int x, y;
@@ -898,7 +898,7 @@ rotate_fixed_position_01( struct fixed_position* position )
  * Rotate a triangle, flipping its clockwise direction,
  * Swaps values for xy[1] and xy[2]
  */
-static INLINE void
+static inline void
 rotate_fixed_position_12( struct fixed_position* position )
 {
    int x, y;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
index 89992007849..534c5f48a64 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -122,7 +122,7 @@ lp_setup_set_primitive(struct vbuf_render *vbr, unsigned prim)
 
 typedef const float (*const_float4_ptr)[4];
 
-static INLINE const_float4_ptr get_vert( const void *vertex_buffer,
+static inline const_float4_ptr get_vert( const void *vertex_buffer,
                                          int index,
                                          int stride )
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index b5ce8683f1a..fd6c49aacd8 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -840,7 +840,7 @@ store_unswizzled_block(struct gallivm_state *gallivm,
  *
  * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
  */
-static INLINE boolean
+static inline boolean
 is_arithmetic_format(const struct util_format_description *format_desc)
 {
    boolean arith = false;
@@ -860,7 +860,7 @@ is_arithmetic_format(const struct util_format_description *format_desc)
  * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
  * SoA conversion.
  */
-static INLINE boolean
+static inline boolean
 format_expands_to_float_soa(const struct util_format_description *format_desc)
 {
    if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
@@ -876,7 +876,7 @@ format_expands_to_float_soa(const struct util_format_description *format_desc)
  *
  * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
  */
-static INLINE void
+static inline void
 lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                              struct lp_type* type)
 {
@@ -924,7 +924,7 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
  *
  * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
  */
-static INLINE void
+static inline void
 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
                                struct lp_type* type)
 {
@@ -996,7 +996,7 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc
  *
  * but we try to avoid division and multiplication through shifts.
  */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
 scale_bits(struct gallivm_state *gallivm,
            int src_bits,
            int dst_bits,
@@ -1108,7 +1108,7 @@ scale_bits(struct gallivm_state *gallivm,
 /**
  * If RT is a smallfloat (needing denorms) format
  */
-static INLINE int
+static inline int
 have_smallfloat_format(struct lp_type dst_type,
                        enum pipe_format format)
 {
@@ -2880,7 +2880,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
 /**
  * Return the blend factor equivalent to a destination alpha of one.
  */
-static INLINE unsigned
+static inline unsigned
 force_dst_alpha_one(unsigned factor, boolean clamped_zero)
 {
    switch(factor) {
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index 4b6c8a7a6a5..e1b51c9c9a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -77,7 +77,7 @@ unsigned __int64 __rdtsc();
 
 #elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
 
-static INLINE uint64_t
+static inline uint64_t
 rdtsc(void)
 {
    uint32_t hi, lo;
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h
index 9fbd3a21648..3d315bb9a73 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -106,21 +106,21 @@ struct llvmpipe_transfer
 
 
 /** cast wrappers */
-static INLINE struct llvmpipe_resource *
+static inline struct llvmpipe_resource *
 llvmpipe_resource(struct pipe_resource *pt)
 {
    return (struct llvmpipe_resource *) pt;
 }
 
 
-static INLINE const struct llvmpipe_resource *
+static inline const struct llvmpipe_resource *
 llvmpipe_resource_const(const struct pipe_resource *pt)
 {
    return (const struct llvmpipe_resource *) pt;
 }
 
 
-static INLINE struct llvmpipe_transfer *
+static inline struct llvmpipe_transfer *
 llvmpipe_transfer(struct pipe_transfer *pt)
 {
    return (struct llvmpipe_transfer *) pt;
@@ -131,7 +131,7 @@ void llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen);
 void llvmpipe_init_context_resource_funcs(struct pipe_context *pipe);
 
 
-static INLINE boolean
+static inline boolean
 llvmpipe_resource_is_texture(const struct pipe_resource *resource)
 {
    switch (resource->target) {
@@ -153,7 +153,7 @@ llvmpipe_resource_is_texture(const struct pipe_resource *resource)
 }
 
 
-static INLINE boolean
+static inline boolean
 llvmpipe_resource_is_1d(const struct pipe_resource *resource)
 {
    switch (resource->target) {
@@ -175,7 +175,7 @@ llvmpipe_resource_is_1d(const struct pipe_resource *resource)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 llvmpipe_layer_stride(struct pipe_resource *resource,
                       unsigned level)
 {
@@ -185,7 +185,7 @@ llvmpipe_layer_stride(struct pipe_resource *resource,
 }
 
 
-static INLINE unsigned
+static inline unsigned
 llvmpipe_resource_stride(struct pipe_resource *resource,
                          unsigned level)
 {
diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am
index d05f0a17ab4..c52d62e54a2 100644
--- a/src/gallium/drivers/nouveau/Makefile.am
+++ b/src/gallium/drivers/nouveau/Makefile.am
@@ -20,8 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index ca3c806e92f..cce60550ae5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1153,8 +1153,8 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info)
 
    switch (info->type) {
    PROG_TYPE_CASE(VERTEX, VERTEX);
-// PROG_TYPE_CASE(HULL, TESSELLATION_CONTROL);
-// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL);
+   PROG_TYPE_CASE(TESS_CTRL, TESSELLATION_CONTROL);
+   PROG_TYPE_CASE(TESS_EVAL, TESSELLATION_EVAL);
    PROG_TYPE_CASE(GEOMETRY, GEOMETRY);
    PROG_TYPE_CASE(FRAGMENT, FRAGMENT);
    PROG_TYPE_CASE(COMPUTE, COMPUTE);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 529dcb9bdc2..3ddaeafebbd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -106,6 +106,7 @@ enum operation
    OP_MEMBAR, // memory barrier (mfence, lfence, sfence)
    OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base
    OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1]
+   OP_AFETCH, // fetch base address of shader input (a[%r1+0x10])
    OP_EXPORT,
    OP_LINTERP,
    OP_PINTERP,
@@ -372,7 +373,8 @@ enum SVSemantic
    SV_SAMPLE_INDEX,
    SV_SAMPLE_POS,
    SV_SAMPLE_MASK,
-   SV_TESS_FACTOR,
+   SV_TESS_OUTER,
+   SV_TESS_INNER,
    SV_TESS_COORD,
    SV_TID,
    SV_CTAID,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
index 51b9225156b..fa8ee072a92 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -332,6 +332,9 @@ BasicBlock::splitBefore(Instruction *insn, bool attach)
    BasicBlock *bb = new BasicBlock(func);
    assert(!insn || insn->op != OP_PHI);
 
+   bb->joinAt = joinAt;
+   joinAt = NULL;
+
    splitCommon(insn, bb, attach);
    return bb;
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 708c5b322ee..19418c0e0f1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -428,8 +428,7 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
 {
    Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0);
 
-   assert(svIndex < 4 ||
-          (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR));
+   assert(svIndex < 4 || svName == SV_CLIP_DISTANCE);
 
    switch (svName) {
    case SV_POSITION:
@@ -438,7 +437,9 @@ BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
    case SV_POINT_SIZE:
    case SV_POINT_COORD:
    case SV_CLIP_DISTANCE:
-   case SV_TESS_FACTOR:
+   case SV_TESS_OUTER:
+   case SV_TESS_INNER:
+   case SV_TESS_COORD:
       sym->reg.type = TYPE_F32;
       break;
    default:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index dba56bf2716..2b9edcf9172 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -69,18 +69,6 @@ struct nv50_ir_varying
 # define NV50_IR_DEBUG_REG_ALLOC 0
 #endif
 
-#define NV50_SEMANTIC_CLIPDISTANCE  (TGSI_SEMANTIC_COUNT + 0)
-#define NV50_SEMANTIC_TESSFACTOR    (TGSI_SEMANTIC_COUNT + 7)
-#define NV50_SEMANTIC_TESSCOORD     (TGSI_SEMANTIC_COUNT + 8)
-#define NV50_SEMANTIC_COUNT         (TGSI_SEMANTIC_COUNT + 10)
-
-#define NV50_TESS_PART_FRACT_ODD  0
-#define NV50_TESS_PART_FRACT_EVEN 1
-#define NV50_TESS_PART_POW2       2
-#define NV50_TESS_PART_INTEGER    3
-
-#define NV50_PRIM_PATCHES PIPE_PRIM_MAX
-
 struct nv50_ir_prog_symbol
 {
    uint32_t label;
@@ -151,10 +139,10 @@ struct nv50_ir_prog_info
       } gp;
       struct {
          unsigned numColourResults;
-         boolean writesDepth;
-         boolean earlyFragTests;
-         boolean separateFragData;
-         boolean usesDiscard;
+         bool writesDepth;
+         bool earlyFragTests;
+         bool separateFragData;
+         bool usesDiscard;
       } fp;
       struct {
          uint32_t inputOffset; /* base address for user args */
@@ -180,11 +168,11 @@ struct nv50_ir_prog_info
       int8_t viewportId;         /* output index of ViewportIndex */
       uint8_t fragDepth;         /* output index of FragDepth */
       uint8_t sampleMask;        /* output index of SampleMask */
-      boolean sampleInterp;      /* perform sample interp on all fp inputs */
+      bool sampleInterp;         /* perform sample interp on all fp inputs */
       uint8_t backFaceColor[2];  /* input/output indices of back face colour */
       uint8_t globalAccess;      /* 1 for read, 2 for wr, 3 for rw */
-      boolean fp64;              /* program uses fp64 math */
-      boolean nv50styleSurfaces; /* generate gX[] access for raw buffers */
+      bool fp64;                 /* program uses fp64 math */
+      bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
       uint8_t resInfoCBSlot;     /* cX[] used for tex handles, surface info */
       uint16_t texBindBase;      /* base address for tex handles (nve4) */
       uint16_t suInfoBase;       /* base address for surface info (nve4) */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index ab8bf2e5504..f06056f8f17 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -77,6 +77,7 @@ private:
    void emitMOV(const Instruction *);
 
    void emitINTERP(const Instruction *);
+   void emitAFETCH(const Instruction *);
    void emitPFETCH(const Instruction *);
    void emitVFETCH(const Instruction *);
    void emitEXPORT(const Instruction *);
@@ -120,6 +121,8 @@ private:
 
    void emitPIXLD(const Instruction *);
 
+   void emitBAR(const Instruction *);
+
    void emitFlow(const Instruction *);
 
    inline void defId(const ValueDef&, const int pos);
@@ -1249,6 +1252,13 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i)
    code[1] |= 0x00070000;
 }
 
+void
+CodeEmitterGK110::emitBAR(const Instruction *i)
+{
+   /* TODO: no barrier encoding is written here yet; emitNOP() is used instead. */
+   emitNOP(i);
+}
+
 void
 CodeEmitterGK110::emitFlow(const Instruction *i)
 {
@@ -1329,6 +1339,23 @@ CodeEmitterGK110::emitFlow(const Instruction *i)
    }
 }
 
+void
+CodeEmitterGK110::emitAFETCH(const Instruction *i)
+{
+   uint32_t offset = i->src(0).get()->reg.data.offset & 0x7ff;
+   // Encodes OP_AFETCH; the 11-bit offset's low 9 bits go into code[0]
+   code[0] = 0x00000002 | (offset << 23);
+   code[1] = 0x7d000000 | (offset >> 9); // ... and its top 2 bits go here.
+   // Extra flag bit when source 0 is in the shader output file.
+   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+      code[1] |= 0x8;
+
+   emitPredicate(i);
+   // Def at 'pos' 2, the indirect address of source 0 at 'pos' 10.
+   defId(i->def(0), 2);
+   srcId(i->src(0).getIndirect(0), 10);
+}
+
 void
 CodeEmitterGK110::emitPFETCH(const Instruction *i)
 {
@@ -1698,6 +1725,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
    case OP_EXPORT:
       emitEXPORT(insn);
       break;
+   case OP_AFETCH:
+      emitAFETCH(insn);
+      break;
    case OP_PFETCH:
       emitPFETCH(insn);
       break;
@@ -1856,6 +1886,9 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
       emitNOP(insn);
       insn->join = 1;
       break;
+   case OP_BAR:
+      emitBAR(insn);
+      break;
    case OP_PHI:
    case OP_UNION:
    case OP_CONSTRAINT:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 399a6f1db13..ef5c87d0437 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -174,6 +174,7 @@ private:
    void emitALD();
    void emitAST();
    void emitISBERD();
+   void emitAL2P();
    void emitIPA();
 
    void emitPIXLD();
@@ -2203,6 +2204,17 @@ CodeEmitterGM107::emitISBERD()
    emitGPR (0x00, insn->def(0));
 }
 
+void
+CodeEmitterGM107::emitAL2P()
+{
+   emitInsn (0xefa00000); // this emitter is dispatched for OP_AFETCH
+   emitField(0x2f, 2, (insn->getDef(0)->reg.size / 4) - 1); // def size / 4, minus 1
+   emitO    (0x20);
+   emitField(0x14, 11, insn->src(0).get()->reg.data.offset); // offset of source 0
+   emitGPR  (0x08, insn->src(0).getIndirect(0)); // indirect address of source 0
+   emitGPR  (0x00, insn->def(0)); // result
+}
+
 void
 CodeEmitterGM107::emitIPA()
 {
@@ -2441,8 +2453,14 @@ CodeEmitterGM107::emitTXQ()
       break;
    }
 
-   emitInsn (0xdf4a0000);
-   emitField(0x24, 13, insn->tex.r);
+   if (insn->tex.rIndirectSrc >= 0) {
+      emitInsn (0xdf500000);
+   } else {
+      emitInsn (0xdf480000);
+      emitField(0x24, 13, insn->tex.r);
+   }
+
+   emitField(0x31, 1, insn->tex.liveOnly);
    emitField(0x1f, 4, insn->tex.mask);
    emitField(0x16, 6, type);
    emitGPR  (0x08, insn->src(0));
@@ -2753,6 +2771,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
    case OP_PFETCH:
       emitISBERD();
       break;
+   case OP_AFETCH:
+      emitAL2P();
+      break;
    case OP_LINTERP:
    case OP_PINTERP:
       emitIPA();
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 1bfc8e32e84..67ea6df773c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -499,10 +499,14 @@ CodeEmitterNV50::emitForm_MAD(const Instruction *i)
    setSrc(i, 2, 2);
 
    if (i->getIndirect(0, 0)) {
-      assert(!i->getIndirect(1, 0));
+      assert(!i->srcExists(1) || !i->getIndirect(1, 0));
+      assert(!i->srcExists(2) || !i->getIndirect(2, 0));
       setAReg16(i, 0);
-   } else {
+   } else if (i->srcExists(1) && i->getIndirect(1, 0)) {
+      assert(!i->srcExists(2) || !i->getIndirect(2, 0));
       setAReg16(i, 1);
+   } else {
+      setAReg16(i, 2);
    }
 }
 
@@ -546,7 +550,7 @@ CodeEmitterNV50::emitForm_MUL(const Instruction *i)
 }
 
 // usual immediate form
-// - 1 to 3 sources where last is immediate (rir, gir)
+// - 1 to 3 sources where second is immediate (rir, gir)
 // - no address or predicate possible
 void
 CodeEmitterNV50::emitForm_IMM(const Instruction *i)
@@ -562,7 +566,7 @@ CodeEmitterNV50::emitForm_IMM(const Instruction *i)
    if (Target::operationSrcNr[i->op] > 1) {
       setSrc(i, 0, 0);
       setImmediate(i, 1);
-      setSrc(i, 2, 1);
+      // If there is another source, it has to be the same as the dest reg.
    } else {
       setImmediate(i, 0);
    }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 472e3a84119..f607f3ba3ec 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -85,6 +85,7 @@ private:
    void emitCCTL(const Instruction *);
 
    void emitINTERP(const Instruction *);
+   void emitAFETCH(const Instruction *);
    void emitPFETCH(const Instruction *);
    void emitVFETCH(const Instruction *);
    void emitEXPORT(const Instruction *);
@@ -1450,6 +1451,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
       ImmediateValue *imm = i->getSrc(0)->asImm();
       assert(imm);
       code[0] |= imm->reg.data.u32 << 20;
+      code[1] |= 0x8000;
    }
 
    // thread count
@@ -1460,6 +1462,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
       assert(imm);
       code[0] |= imm->reg.data.u32 << 26;
       code[1] |= imm->reg.data.u32 >> 6;
+      code[1] |= 0x4000;
    }
 
    if (i->srcExists(2) && (i->predSrc != 2)) {
@@ -1493,6 +1496,21 @@ CodeEmitterNVC0::emitBAR(const Instruction *i)
    }
 }
 
+void
+CodeEmitterNVC0::emitAFETCH(const Instruction *i)
+{
+   code[0] = 0x00000006;
+   code[1] = 0x0c000000 | (i->src(0).get()->reg.data.offset & 0x7ff);
+   // Only the low 11 bits of the offset are encoded (mask 0x7ff above).
+   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+      code[0] |= 0x200; // extra flag bit for the shader output file
+
+   emitPredicate(i);
+   // Def and source 0's indirect address; 14/20 are presumably bit positions.
+   defId(i->def(0), 14);
+   srcId(i->src(0).getIndirect(0), 20);
+}
+
 void
 CodeEmitterNVC0::emitPFETCH(const Instruction *i)
 {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index ecd115f9807..4847a0f3355 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -372,6 +372,10 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_SAMPLEPOS:  return nv50_ir::SV_SAMPLE_POS;
    case TGSI_SEMANTIC_SAMPLEMASK: return nv50_ir::SV_SAMPLE_MASK;
    case TGSI_SEMANTIC_INVOCATIONID: return nv50_ir::SV_INVOCATION_ID;
+   case TGSI_SEMANTIC_TESSCOORD:  return nv50_ir::SV_TESS_COORD;
+   case TGSI_SEMANTIC_TESSOUTER:  return nv50_ir::SV_TESS_OUTER;
+   case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
+   case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
@@ -434,7 +438,6 @@ nv50_ir::DataType Instruction::inferSrcType() const
    case TGSI_OPCODE_USLT:
    case TGSI_OPCODE_USNE:
    case TGSI_OPCODE_USHR:
-   case TGSI_OPCODE_UCMP:
    case TGSI_OPCODE_ATOMUADD:
    case TGSI_OPCODE_ATOMXCHG:
    case TGSI_OPCODE_ATOMCAS:
@@ -827,7 +830,7 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
    if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
       tgsi_dump(tokens, 0);
 
-   mainTempsInLMem = FALSE;
+   mainTempsInLMem = false;
 }
 
 Source::~Source()
@@ -938,7 +941,7 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
       info->prop.gp.instanceCount = prop->u[0].Data;
       break;
    case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-      info->prop.fp.separateFragData = TRUE;
+      info->prop.fp.separateFragData = true;
       break;
    case TGSI_PROPERTY_FS_COORD_ORIGIN:
    case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER:
@@ -947,6 +950,24 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
    case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
       info->io.genUserClip = -1;
       break;
+   case TGSI_PROPERTY_TCS_VERTICES_OUT:
+      info->prop.tp.outputPatchSize = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_PRIM_MODE:
+      info->prop.tp.domain = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_SPACING:
+      info->prop.tp.partitioning = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_VERTEX_ORDER_CW:
+      info->prop.tp.winding = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_TES_POINT_MODE:
+      if (prop->u[0].Data)
+         info->prop.tp.outputPrim = PIPE_PRIM_POINTS;
+      else
+         info->prop.tp.outputPrim = PIPE_PRIM_TRIANGLES; /* anything but points */
+      break;
    default:
       INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
       break;
@@ -1035,6 +1056,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
                if (decl->Interp.Location || info->io.sampleInterp)
                   info->in[i].centroid = 1;
             }
+
+            if (sn == TGSI_SEMANTIC_PATCH)
+               info->in[i].patch = 1;
+            if (sn == TGSI_SEMANTIC_PATCH)
+               info->numPatchConstants = MAX2(info->numPatchConstants, si + 1);
          }
       }
       break;
@@ -1069,6 +1095,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          case TGSI_SEMANTIC_VIEWPORT_INDEX:
             info->io.viewportId = i;
             break;
+         case TGSI_SEMANTIC_PATCH:
+            info->numPatchConstants = MAX2(info->numPatchConstants, si + 1);
+            /* fallthrough */
+         case TGSI_SEMANTIC_TESSOUTER:
+         case TGSI_SEMANTIC_TESSINNER:
+            info->out[i].patch = 1;
+            break;
          default:
             break;
          }
@@ -1092,6 +1125,13 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          info->sv[i].sn = sn;
          info->sv[i].si = si;
          info->sv[i].input = inferSysValDirection(sn);
+
+         switch (sn) {
+         case TGSI_SEMANTIC_TESSOUTER:
+         case TGSI_SEMANTIC_TESSINNER:
+            info->sv[i].patch = 1;
+            break;
+         }
       }
       break;
    case TGSI_FILE_RESOURCE:
@@ -1156,7 +1196,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       } else
       if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
          if (insn.getDst(0).isIndirect(0))
-            mainTempsInLMem = TRUE;
+            mainTempsInLMem = true;
       }
    }
 
@@ -1164,12 +1204,22 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       Instruction::SrcRegister src = insn.getSrc(s);
       if (src.getFile() == TGSI_FILE_TEMPORARY) {
          if (src.isIndirect(0))
-            mainTempsInLMem = TRUE;
+            mainTempsInLMem = true;
       } else
       if (src.getFile() == TGSI_FILE_RESOURCE) {
          if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
             info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
                0x1 : 0x2;
+      } else
+      if (src.getFile() == TGSI_FILE_OUTPUT) {
+         if (src.isIndirect(0)) {
+            // We don't know which one is accessed, just mark everything for
+            // reading. This is an extremely unlikely occurrence.
+            for (unsigned i = 0; i < info->numOutputs; ++i)
+               info->out[i].oread = 1;
+         } else {
+            info->out[src.getIndex(0)].oread = 1;
+         }
       }
       if (src.getFile() != TGSI_FILE_INPUT)
          continue;
@@ -1246,6 +1296,7 @@ private:
 
    Value *shiftAddress(Value *);
    Value *getVertexBase(int s);
+   Value *getOutputBase(int s);
    DataArray *getArrayForFile(unsigned file, int idx);
    Value *fetchSrc(int s, int c);
    Value *acquireDst(int d, int c);
@@ -1343,6 +1394,8 @@ private:
    Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP)
    uint8_t vtxBaseValid;
 
+   Value *outBase; // base address of vertex out patch (for TCP)
+
    Stack condBBs;  // fork BB, then else clause BB
    Stack joinBBs;  // fork BB, for inserting join ops on ENDIF
    Stack loopBBs;  // loop headers
@@ -1475,6 +1528,22 @@ Converter::getVertexBase(int s)
    return vtxBase[s];
 }
 
+Value *
+Converter::getOutputBase(int s)
+{
+   assert(s < 5); // vtxBase[] has 5 entries
+   if (!(vtxBaseValid & (1 << s))) { // base for source s not computed yet
+      Value *offset = loadImm(NULL, tgsi.getSrc(s).getIndex(1)); // index of dimension 1
+      if (tgsi.getSrc(s).isIndirect(1)) // add the indirect part of that index
+         offset = mkOp2v(OP_ADD, TYPE_U32, getSSA(),
+                         fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL),
+                         offset);
+      vtxBaseValid |= 1 << s; // vtxBase[] is also what getVertexBase() returns
+      vtxBase[s] = mkOp2v(OP_ADD, TYPE_U32, getSSA(), outBase, offset);
+   }
+   return vtxBase[s]; // outBase + the dimension-1 index of source s
+}
+
 Value *
 Converter::fetchSrc(int s, int c)
 {
@@ -1488,6 +1557,9 @@ Converter::fetchSrc(int s, int c)
 
    if (src.is2D()) {
       switch (src.getFile()) {
+      case TGSI_FILE_OUTPUT:
+         dimRel = getOutputBase(s);
+         break;
       case TGSI_FILE_INPUT:
          dimRel = getVertexBase(s);
          break;
@@ -1542,6 +1614,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
    const int idx2d = src.is2D() ? src.getIndex(1) : 0;
    const int idx = src.getIndex(0);
    const int swz = src.getSwizzle(c);
+   Instruction *ld;
 
    switch (src.getFile()) {
    case TGSI_FILE_IMMEDIATE:
@@ -1569,13 +1642,19 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
          if (ptr)
             return mkLoadv(TYPE_U32, srcToSym(src, c), ptr);
       }
-      return mkLoadv(TYPE_U32, srcToSym(src, c), shiftAddress(ptr));
+      ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr));
+      ld->perPatch = info->in[idx].patch;
+      return ld->getDef(0);
    case TGSI_FILE_OUTPUT:
-      assert(!"load from output file");
-      return NULL;
+      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+      ld = mkLoad(TYPE_U32, getSSA(), srcToSym(src, c), shiftAddress(ptr));
+      ld->perPatch = info->out[idx].patch;
+      return ld->getDef(0);
    case TGSI_FILE_SYSTEM_VALUE:
       assert(!ptr);
-      return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
+      ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
+      ld->perPatch = info->sv[idx].patch;
+      return ld->getDef(0);
    default:
       return getArrayForFile(src.getFile(), idx2d)->load(
          sub.cur->values, idx, swz, shiftAddress(ptr));
@@ -1645,7 +1724,8 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
              viewport != NULL)
             mkOp1(OP_MOV, TYPE_U32, viewport, val);
          else
-            mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val);
+            mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val)->perPatch =
+               info->out[idx].patch;
       }
    } else
    if (f == TGSI_FILE_TEMPORARY ||
@@ -1687,6 +1767,7 @@ Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork)
    join->fixed = 1;
    conv->insertHead(join);
 
+   assert(!fork->joinAt);
    fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv);
    fork->insertBefore(fork->getExit(), fork->joinAt);
 }
@@ -1728,7 +1809,7 @@ Converter::handleTXQ(Value *dst0[4], enum TexQuery query)
    }
    tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level
 
-   setTexRS(tex, c, 1, -1);
+   setTexRS(tex, ++c, 1, -1);
 
    bb->insertTail(tex);
 }
@@ -2569,6 +2650,8 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       }
       break;
    case TGSI_OPCODE_UCMP:
+      srcTy = TYPE_U32;
+      /* fallthrough */
    case TGSI_OPCODE_CMP:
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
          src0 = fetchSrc(0, c);
@@ -3282,10 +3365,21 @@ Converter::run()
          clipVtx[c] = getScratch();
    }
 
-   if (prog->getType() == Program::TYPE_FRAGMENT) {
+   switch (prog->getType()) {
+   case Program::TYPE_TESSELLATION_CONTROL:
+      outBase = mkOp2v(
+         OP_SUB, TYPE_U32, getSSA(),
+         mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LANEID, 0)),
+         mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_INVOCATION_ID, 0)));
+      break;
+   case Program::TYPE_FRAGMENT: {
       Symbol *sv = mkSysVal(SV_POSITION, 3);
       fragCoord[3] = mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv);
       mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]);
+      break;
+   }
+   default:
+      break;
    }
 
    if (info->io.viewportId >= 0)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 596ac95d489..1f3fce2bb9a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -176,7 +176,7 @@ GM107LoweringPass::handlePOPCNT(Instruction *i)
                            i->getSrc(0), i->getSrc(1));
    i->setSrc(0, tmp);
    i->setSrc(1, NULL);
-   return TRUE;
+   return true;
 }
 
 //
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 2c7f7e326b2..bea293bac99 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -871,6 +871,7 @@ NV50LoweringPreSSA::handleTXL(TexInstruction *i)
    BasicBlock *joinBB = i->bb->splitAfter(i);
 
    bld.setPosition(currBB, true);
+   assert(!currBB->joinAt);
    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
 
    for (int l = 0; l <= 3; ++l) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 7a5d1ce0299..c3c302da5c8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -559,6 +559,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
       } else
       if (i->isNop()) {
          bb->remove(i);
+      } else
+      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
+          prog->getType() != Program::TYPE_COMPUTE) {
+         // It seems like barriers are never required for tessellation since
+         // the warp size is 32, and there are always at most 32 tcs threads.
+         bb->remove(i);
       } else {
          // TODO: Move this to before register allocation for operations that
          // need the $c register !
@@ -956,7 +962,43 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
 bool
 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
 {
-   // TODO: indirect resource/sampler index
+   if (txq->tex.rIndirectSrc < 0)
+      return true;
+
+   Value *ticRel = txq->getIndirectR();
+   const int chipset = prog->getTarget()->getChipset();
+
+   txq->setIndirectS(NULL);
+   txq->tex.sIndirectSrc = -1;
+
+   assert(ticRel);
+
+   if (chipset < NVISA_GK104_CHIPSET) {
+      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+
+      txq->setSrc(txq->tex.rIndirectSrc, NULL);
+      if (txq->tex.r)
+         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                             ticRel, bld.mkImm(txq->tex.r));
+
+      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
+
+      txq->moveSources(0, 1);
+      txq->setSrc(0, src);
+   } else {
+      Value *hnd = loadTexHandle(
+            bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                       txq->getIndirectR(), bld.mkImm(2)),
+            txq->tex.r);
+      txq->tex.r = 0xff;
+      txq->tex.s = 0x1f;
+
+      txq->setIndirectR(NULL);
+      txq->moveSources(0, 1);
+      txq->setSrc(0, hnd);
+      txq->tex.rIndirectSrc = 0;
+   }
+
    return true;
 }
 
@@ -1485,6 +1527,10 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
          i->op = OP_MOV;
          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
       }
+      if (sv == SV_VERTEX_COUNT) {
+         bld.setPosition(i, true);
+         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
+      }
       return true;
    }
 
@@ -1554,7 +1600,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
       break;
    default:
-      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
+      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
       ld = bld.mkFetch(i->getDef(0), i->dType,
                        FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
@@ -1705,6 +1751,7 @@ NVC0LoweringPass::checkPredicate(Instruction *insn)
 bool
 NVC0LoweringPass::visit(Instruction *i)
 {
+   bool ret = true;
    bld.setPosition(i, false);
 
    if (i->cc != CC_ALWAYS)
@@ -1736,7 +1783,8 @@ NVC0LoweringPass::visit(Instruction *i)
    case OP_SQRT:
       return handleSQRT(i);
    case OP_EXPORT:
-      return handleEXPORT(i);
+      ret = handleEXPORT(i);
+      break;
    case OP_EMIT:
    case OP_RESTART:
       return handleOUT(i);
@@ -1775,6 +1823,9 @@ NVC0LoweringPass::visit(Instruction *i)
             i->setIndirect(0, 0, ptr);
             i->subOp = NV50_IR_SUBOP_LDC_IS;
          }
+      } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
+         assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+         i->op = OP_VFETCH;
       }
       break;
    case OP_ATOM:
@@ -1796,7 +1847,20 @@ NVC0LoweringPass::visit(Instruction *i)
    default:
       break;
    }
-   return true;
+
+   /* Kepler+ has a special opcode to compute a new base address to be used
+    * for indirect loads.
+    */
+   if (targ->getChipset() >= NVISA_GK104_CHIPSET && !i->perPatch &&
+       (i->op == OP_VFETCH || i->op == OP_EXPORT) && i->src(0).isIndirect(0)) {
+      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
+                                      cloneShallow(func, i->getSrc(0)));
+      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
+      i->src(0).get()->reg.data.offset = 0;
+      i->setIndirect(0, 0, afetch->getDef(0));
+   }
+
+   return ret;
 }
 
 bool
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index ae739eeda83..cea96dcdfc5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -608,9 +608,12 @@ ConstantFolding::expr(Instruction *i,
    case OP_FMA: {
       i->op = OP_ADD;
 
+      /* Move the immediate to the second arg, otherwise the ADD operation
+       * won't be emittable
+       */
       i->setSrc(1, i->getSrc(0));
-      i->src(1).mod = i->src(2).mod;
       i->setSrc(0, i->getSrc(2));
+      i->src(0).mod = i->src(2).mod;
       i->setSrc(2, NULL);
 
       ImmediateValue src0;
@@ -2082,6 +2085,8 @@ MemoryOpt::runOpt(BasicBlock *bb)
       }
       if (ldst->getPredicate()) // TODO: handle predicated ld/st
          continue;
+      if (ldst->perPatch) // TODO: create separate per-patch lists
+         continue;
 
       if (isLoad) {
          DataFile file = ldst->src(0).getFile();
@@ -2515,6 +2520,8 @@ Instruction::isResultEqual(const Instruction *that) const
       case FILE_MEMORY_CONST:
       case FILE_SHADER_INPUT:
          return true;
+      case FILE_SHADER_OUTPUT:
+         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
       default:
          return false;
       }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index ef3de6ff92a..9ebdc6586db 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -135,6 +135,7 @@ const char *operationStr[OP_LAST + 1] =
    "membar",
    "vfetch",
    "pfetch",
+   "afetch",
    "export",
    "linterp",
    "pinterp",
@@ -258,7 +259,8 @@ static const char *SemanticStr[SV_LAST + 1] =
    "SAMPLE_INDEX",
    "SAMPLE_POS",
    "SAMPLE_MASK",
-   "TESS_FACTOR",
+   "TESS_OUTER",
+   "TESS_INNER",
    "TESS_COORD",
    "TID",
    "CTAID",
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 898653c9953..78bc97f4397 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -2066,6 +2066,8 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
          condenseDefs(i);
          if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8)
             addHazard(i, i->src(0).getIndirect(0));
+         if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8)
+            addHazard(i, i->src(0).getIndirect(1));
       } else
       if (i->op == OP_UNION ||
           i->op == OP_MERGE ||
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index 7992f539782..fe530c76b62 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -41,7 +41,7 @@ const uint8_t Target::operationSrcNr[] =
    0, 0, 0, 0, 0,          // BRA, CALL, RET, CONT, BREAK,
    0, 0, 0,                // PRERET,CONT,BREAK
    0, 0, 0, 0, 0, 0,       // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR
-   1, 1, 2, 1, 2,          // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP
+   1, 1, 1, 2, 1, 2,       // VFETCH, PFETCH, AFETCH, EXPORT, LINTERP, PINTERP
    1, 1,                   // EMIT, RESTART
    1, 1, 1,                // TEX, TXB, TXL,
    1, 1, 1, 1, 1, 1, 2,    // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP
@@ -96,8 +96,8 @@ const OpClass Target::operationClass[] =
    OPCLASS_FLOW, OPCLASS_FLOW,
    // MEMBAR
    OPCLASS_CONTROL,
-   // VFETCH, PFETCH, EXPORT
-   OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE,
+   // VFETCH, PFETCH, AFETCH, EXPORT
+   OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_STORE,
    // LINTERP, PINTERP
    OPCLASS_SFU, OPCLASS_SFU,
    // EMIT, RESTART
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index ca545a6024a..f3ddcaa5199 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -118,7 +118,7 @@ void TargetNV50::initOpInfo()
    static const uint32_t shortForm[(OP_LAST + 31) / 32] =
    {
       // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF
-      0x00014e40, 0x00000040, 0x00000498, 0x00000000
+      0x00014e40, 0x00000040, 0x00000930, 0x00000000
    };
    static const operation noDestList[] =
    {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 7d4a859dde4..27df0eba66b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -286,7 +286,8 @@ TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
    case SV_CLIP_DISTANCE:  return 0x2c0 + idx * 4;
    case SV_POINT_COORD:    return 0x2e0 + idx * 4;
    case SV_FACE:           return 0x3fc;
-   case SV_TESS_FACTOR:    return 0x000 + idx * 4;
+   case SV_TESS_OUTER:     return 0x000 + idx * 4;
+   case SV_TESS_INNER:     return 0x010 + idx * 4;
    case SV_TESS_COORD:     return 0x2f0 + idx * 4;
    case SV_NTID:           return kepler ? (0x00 + idx * 4) : ~0;
    case SV_NCTAID:         return kepler ? (0x0c + idx * 4) : ~0;
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 09cdbb53ecb..67e181e803a 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -22,13 +22,13 @@ struct nouveau_transfer {
    uint32_t offset;
 };
 
-static INLINE struct nouveau_transfer *
+static inline struct nouveau_transfer *
 nouveau_transfer(struct pipe_transfer *transfer)
 {
    return (struct nouveau_transfer *)transfer;
 }
 
-static INLINE boolean
+static inline bool
 nouveau_buffer_malloc(struct nv04_resource *buf)
 {
    if (!buf->data)
@@ -36,16 +36,11 @@ nouveau_buffer_malloc(struct nv04_resource *buf)
    return !!buf->data;
 }
 
-static INLINE boolean
+static inline bool
 nouveau_buffer_allocate(struct nouveau_screen *screen,
                         struct nv04_resource *buf, unsigned domain)
 {
-   uint32_t size = buf->base.width0;
-
-   if (buf->base.bind & (PIPE_BIND_CONSTANT_BUFFER |
-                         PIPE_BIND_COMPUTE_RESOURCE |
-                         PIPE_BIND_SHADER_RESOURCE))
-      size = align(size, 0x100);
+   uint32_t size = align(buf->base.width0, 0x100);
 
    if (domain == NOUVEAU_BO_VRAM) {
       buf->mm = nouveau_mm_allocate(screen->mm_VRAM, size,
@@ -58,12 +53,12 @@ nouveau_buffer_allocate(struct nouveau_screen *screen,
       buf->mm = nouveau_mm_allocate(screen->mm_GART, size,
                                     &buf->bo, &buf->offset);
       if (!buf->bo)
-         return FALSE;
+         return false;
       NOUVEAU_DRV_STAT(screen, buf_obj_current_bytes_sys, buf->base.width0);
    } else {
       assert(domain == 0);
       if (!nouveau_buffer_malloc(buf))
-         return FALSE;
+         return false;
    }
    buf->domain = domain;
    if (buf->bo)
@@ -71,10 +66,10 @@ nouveau_buffer_allocate(struct nouveau_screen *screen,
 
    util_range_set_empty(&buf->valid_buffer_range);
 
-   return TRUE;
+   return true;
 }
 
-static INLINE void
+static inline void
 release_allocation(struct nouveau_mm_allocation **mm,
                    struct nouveau_fence *fence)
 {
@@ -82,7 +77,7 @@ release_allocation(struct nouveau_mm_allocation **mm,
    (*mm) = NULL;
 }
 
-INLINE void
+inline void
 nouveau_buffer_release_gpu_storage(struct nv04_resource *buf)
 {
    nouveau_bo_ref(NULL, &buf->bo);
@@ -98,7 +93,7 @@ nouveau_buffer_release_gpu_storage(struct nv04_resource *buf)
    buf->domain = 0;
 }
 
-static INLINE boolean
+static inline bool
 nouveau_buffer_reallocate(struct nouveau_screen *screen,
                           struct nv04_resource *buf, unsigned domain)
 {
@@ -139,13 +134,13 @@ nouveau_buffer_destroy(struct pipe_screen *pscreen,
  */
 static uint8_t *
 nouveau_transfer_staging(struct nouveau_context *nv,
-                         struct nouveau_transfer *tx, boolean permit_pb)
+                         struct nouveau_transfer *tx, bool permit_pb)
 {
    const unsigned adj = tx->base.box.x & NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK;
    const unsigned size = align(tx->base.box.width, 4) + adj;
 
    if (!nv->push_data)
-      permit_pb = FALSE;
+      permit_pb = false;
 
    if ((size <= NOUVEAU_TRANSFER_PUSHBUF_THRESHOLD) && permit_pb) {
       tx->map = align_malloc(size, NOUVEAU_MIN_BUFFER_MAP_ALIGN);
@@ -167,7 +162,7 @@ nouveau_transfer_staging(struct nouveau_context *nv,
  * buffer. Also updates buf->data if present.
  *
  * Maybe just migrate to GART right away if we actually need to do this. */
-static boolean
+static bool
 nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx)
 {
    struct nv04_resource *buf = nv04_resource(tx->base.resource);
@@ -180,12 +175,12 @@ nouveau_transfer_read(struct nouveau_context *nv, struct nouveau_transfer *tx)
                  buf->bo, buf->offset + base, buf->domain, size);
 
    if (nouveau_bo_wait(tx->bo, NOUVEAU_BO_RD, nv->client))
-      return FALSE;
+      return false;
 
    if (buf->data)
       memcpy(buf->data + base, tx->map, size);
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -195,7 +190,7 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
    struct nv04_resource *buf = nv04_resource(tx->base.resource);
    uint8_t *data = tx->map + offset;
    const unsigned base = tx->base.box.x + offset;
-   const boolean can_cb = !((base | size) & 3);
+   const bool can_cb = !((base | size) & 3);
 
    if (buf->data)
       memcpy(data, buf->data + base, size);
@@ -224,32 +219,32 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
 /* Does a CPU wait for the buffer's backing data to become reliably accessible
  * for write/read by waiting on the buffer's relevant fences.
  */
-static INLINE boolean
+static inline bool
 nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ) {
       if (!buf->fence_wr)
-         return TRUE;
+         return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence_wr));
       if (!nouveau_fence_wait(buf->fence_wr))
-         return FALSE;
+         return false;
    } else {
       if (!buf->fence)
-         return TRUE;
+         return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence));
       if (!nouveau_fence_wait(buf->fence))
-         return FALSE;
+         return false;
 
       nouveau_fence_ref(NULL, &buf->fence);
    }
    nouveau_fence_ref(NULL, &buf->fence_wr);
 
-   return TRUE;
+   return true;
 }
 
-static INLINE boolean
+static inline bool
 nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ)
@@ -258,7 +253,7 @@ nouveau_buffer_busy(struct nv04_resource *buf, unsigned rw)
       return (buf->fence && !nouveau_fence_signalled(buf->fence));
 }
 
-static INLINE void
+static inline void
 nouveau_buffer_transfer_init(struct nouveau_transfer *tx,
                              struct pipe_resource *resource,
                              const struct pipe_box *box,
@@ -280,7 +275,7 @@ nouveau_buffer_transfer_init(struct nouveau_transfer *tx,
    tx->map = NULL;
 }
 
-static INLINE void
+static inline void
 nouveau_buffer_transfer_del(struct nouveau_context *nv,
                             struct nouveau_transfer *tx)
 {
@@ -297,11 +292,11 @@ nouveau_buffer_transfer_del(struct nouveau_context *nv,
 }
 
 /* Creates a cache in system memory of the buffer data. */
-static boolean
+static bool
 nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf)
 {
    struct nouveau_transfer tx;
-   boolean ret;
+   bool ret;
    tx.base.resource = &buf->base;
    tx.base.box.x = 0;
    tx.base.box.width = buf->base.width0;
@@ -310,13 +305,13 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf)
 
    if (!buf->data)
       if (!nouveau_buffer_malloc(buf))
-         return FALSE;
+         return false;
    if (!(buf->status & NOUVEAU_BUFFER_STATUS_DIRTY))
-      return TRUE;
+      return true;
    nv->stats.buf_cache_count++;
 
-   if (!nouveau_transfer_staging(nv, &tx, FALSE))
-      return FALSE;
+   if (!nouveau_transfer_staging(nv, &tx, false))
+      return false;
 
    ret = nouveau_transfer_read(nv, &tx);
    if (ret) {
@@ -335,15 +330,15 @@ nouveau_buffer_cache(struct nouveau_context *nv, struct nv04_resource *buf)
  * resource. This can be useful if we would otherwise have to wait for a read
  * operation to complete on this data.
  */
-static INLINE boolean
+static inline bool
 nouveau_buffer_should_discard(struct nv04_resource *buf, unsigned usage)
 {
    if (!(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE))
-      return FALSE;
+      return false;
    if (unlikely(buf->base.bind & PIPE_BIND_SHARED))
-      return FALSE;
+      return false;
    if (unlikely(usage & PIPE_TRANSFER_PERSISTENT))
-      return FALSE;
+      return false;
    return buf->mm && nouveau_buffer_busy(buf, PIPE_TRANSFER_WRITE);
 }
 
@@ -413,7 +408,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
           * back into VRAM on unmap. */
          if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)
             buf->status &= NOUVEAU_BUFFER_STATUS_REALLOC_MASK;
-         nouveau_transfer_staging(nv, tx, TRUE);
+         nouveau_transfer_staging(nv, tx, true);
       } else {
          if (buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
             /* The GPU is currently writing to this buffer. Copy its current
@@ -424,13 +419,13 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
                align_free(buf->data);
                buf->data = NULL;
             }
-            nouveau_transfer_staging(nv, tx, FALSE);
+            nouveau_transfer_staging(nv, tx, false);
             nouveau_transfer_read(nv, tx);
          } else {
             /* The buffer is currently idle. Create a staging area for writes,
              * and make sure that the cached data is up-to-date. */
             if (usage & PIPE_TRANSFER_WRITE)
-               nouveau_transfer_staging(nv, tx, TRUE);
+               nouveau_transfer_staging(nv, tx, true);
             if (!buf->data)
                nouveau_buffer_cache(nv, buf);
          }
@@ -482,7 +477,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
       if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
          /* The whole range is being discarded, so it doesn't matter what was
           * there before. No need to copy anything over. */
-         nouveau_transfer_staging(nv, tx, TRUE);
+         nouveau_transfer_staging(nv, tx, true);
          map = tx->map;
       } else
       if (nouveau_buffer_busy(buf, PIPE_TRANSFER_READ)) {
@@ -493,7 +488,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
       } else {
          /* It is expected that the returned buffer be a representation of the
           * data in question, so we must copy it over from the buffer. */
-         nouveau_transfer_staging(nv, tx, TRUE);
+         nouveau_transfer_staging(nv, tx, true);
          if (tx->map)
             memcpy(tx->map, map, box->width);
          map = tx->map;
@@ -544,7 +539,7 @@ nouveau_buffer_transfer_unmap(struct pipe_context *pipe,
          const uint8_t bind = buf->base.bind;
          /* make sure we invalidate dedicated caches */
          if (bind & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER))
-            nv->vbo_dirty = TRUE;
+            nv->vbo_dirty = true;
       }
 
       util_range_add(&buf->valid_buffer_range,
@@ -639,7 +634,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
 {
    struct nouveau_screen *screen = nouveau_screen(pscreen);
    struct nv04_resource *buffer;
-   boolean ret;
+   bool ret;
 
    buffer = CALLOC_STRUCT(nv04_resource);
    if (!buffer)
@@ -683,7 +678,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
    }
    ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);
 
-   if (ret == FALSE)
+   if (ret == false)
       goto fail;
 
    if (buffer->domain == NOUVEAU_BO_VRAM && screen->hint_buf_keep_sysmem_copy)
@@ -730,20 +725,20 @@ nouveau_user_buffer_create(struct pipe_screen *pscreen, void *ptr,
    return &buffer->base;
 }
 
-static INLINE boolean
+static inline bool
 nouveau_buffer_data_fetch(struct nouveau_context *nv, struct nv04_resource *buf,
                           struct nouveau_bo *bo, unsigned offset, unsigned size)
 {
    if (!nouveau_buffer_malloc(buf))
-      return FALSE;
+      return false;
    if (nouveau_bo_map(bo, NOUVEAU_BO_RD, nv->client))
-      return FALSE;
+      return false;
    memcpy(buf->data, (uint8_t *)bo->map + offset, size);
-   return TRUE;
+   return true;
 }
 
 /* Migrate a linear buffer (vertex, index, constants) USER -> GART -> VRAM. */
-boolean
+bool
 nouveau_buffer_migrate(struct nouveau_context *nv,
                        struct nv04_resource *buf, const unsigned new_domain)
 {
@@ -758,7 +753,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv,
 
    if (new_domain == NOUVEAU_BO_GART && old_domain == 0) {
       if (!nouveau_buffer_allocate(screen, buf, new_domain))
-         return FALSE;
+         return false;
       ret = nouveau_bo_map(buf->bo, 0, nv->client);
       if (ret)
          return ret;
@@ -771,7 +766,7 @@ nouveau_buffer_migrate(struct nouveau_context *nv,
       if (new_domain == NOUVEAU_BO_VRAM) {
          /* keep a system memory copy of our data in case we hit a fallback */
          if (!nouveau_buffer_data_fetch(nv, buf, buf->bo, buf->offset, size))
-            return FALSE;
+            return false;
          if (nouveau_mesa_debug)
             debug_printf("migrating %u KiB to VRAM\n", size / 1024);
       }
@@ -792,28 +787,28 @@ nouveau_buffer_migrate(struct nouveau_context *nv,
    if (new_domain == NOUVEAU_BO_VRAM && old_domain == 0) {
       struct nouveau_transfer tx;
       if (!nouveau_buffer_allocate(screen, buf, NOUVEAU_BO_VRAM))
-         return FALSE;
+         return false;
       tx.base.resource = &buf->base;
       tx.base.box.x = 0;
       tx.base.box.width = buf->base.width0;
       tx.bo = NULL;
       tx.map = NULL;
-      if (!nouveau_transfer_staging(nv, &tx, FALSE))
-         return FALSE;
+      if (!nouveau_transfer_staging(nv, &tx, false))
+         return false;
       nouveau_transfer_write(nv, &tx, 0, tx.base.box.width);
       nouveau_buffer_transfer_del(nv, &tx);
    } else
-      return FALSE;
+      return false;
 
    assert(buf->domain == new_domain);
-   return TRUE;
+   return true;
 }
 
 /* Migrate data from glVertexAttribPointer(non-VBO) user buffers to GART.
  * We'd like to only allocate @size bytes here, but then we'd have to rebase
  * the vertex indices ...
  */
-boolean
+bool
 nouveau_user_buffer_upload(struct nouveau_context *nv,
                            struct nv04_resource *buf,
                            unsigned base, unsigned size)
@@ -825,20 +820,20 @@ nouveau_user_buffer_upload(struct nouveau_context *nv,
 
    buf->base.width0 = base + size;
    if (!nouveau_buffer_reallocate(screen, buf, NOUVEAU_BO_GART))
-      return FALSE;
+      return false;
 
    ret = nouveau_bo_map(buf->bo, 0, nv->client);
    if (ret)
-      return FALSE;
+      return false;
    memcpy((uint8_t *)buf->bo->map + buf->offset + base, buf->data + base, size);
 
-   return TRUE;
+   return true;
 }
 
 
 /* Scratch data allocation. */
 
-static INLINE int
+static inline int
 nouveau_scratch_bo_alloc(struct nouveau_context *nv, struct nouveau_bo **pbo,
                          unsigned size)
 {
@@ -875,7 +870,7 @@ nouveau_scratch_runout_release(struct nouveau_context *nv)
 /* Allocate an extra bo if we can't fit everything we need simultaneously.
  * (Could happen for very large user arrays.)
  */
-static INLINE boolean
+static inline bool
 nouveau_scratch_runout(struct nouveau_context *nv, unsigned size)
 {
    int ret;
@@ -909,7 +904,7 @@ nouveau_scratch_runout(struct nouveau_context *nv, unsigned size)
 /* Continue to next scratch buffer, if available (no wrapping, large enough).
  * Allocate it if it has not yet been created.
  */
-static INLINE boolean
+static inline bool
 nouveau_scratch_next(struct nouveau_context *nv, unsigned size)
 {
    struct nouveau_bo *bo;
@@ -917,14 +912,14 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size)
    const unsigned i = (nv->scratch.id + 1) % NOUVEAU_MAX_SCRATCH_BUFS;
 
    if ((size > nv->scratch.bo_size) || (i == nv->scratch.wrap))
-      return FALSE;
+      return false;
    nv->scratch.id = i;
 
    bo = nv->scratch.bo[i];
    if (!bo) {
       ret = nouveau_scratch_bo_alloc(nv, &bo, nv->scratch.bo_size);
       if (ret)
-         return FALSE;
+         return false;
       nv->scratch.bo[i] = bo;
    }
    nv->scratch.current = bo;
@@ -937,10 +932,10 @@ nouveau_scratch_next(struct nouveau_context *nv, unsigned size)
    return !ret;
 }
 
-static boolean
+static bool
 nouveau_scratch_more(struct nouveau_context *nv, unsigned min_size)
 {
-   boolean ret;
+   bool ret;
 
    ret = nouveau_scratch_next(nv, min_size);
    if (!ret)
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.h b/src/gallium/drivers/nouveau/nouveau_buffer.h
index de77f481da3..7e6a6cc804b 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.h
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.h
@@ -58,7 +58,7 @@ nouveau_copy_buffer(struct nouveau_context *,
                     struct nv04_resource *dst, unsigned dst_pos,
                     struct nv04_resource *src, unsigned src_pos, unsigned size);
 
-boolean
+bool
 nouveau_buffer_migrate(struct nouveau_context *,
                        struct nv04_resource *, unsigned domain);
 
@@ -66,20 +66,20 @@ void *
 nouveau_resource_map_offset(struct nouveau_context *, struct nv04_resource *,
                             uint32_t offset, uint32_t flags);
 
-static INLINE void
+static inline void
 nouveau_resource_unmap(struct nv04_resource *res)
 {
    /* no-op */
 }
 
-static INLINE struct nv04_resource *
+static inline struct nv04_resource *
 nv04_resource(struct pipe_resource *resource)
 {
    return (struct nv04_resource *)resource;
 }
 
 /* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */
-static INLINE boolean
+static inline bool
 nouveau_resource_mapped_by_gpu(struct pipe_resource *resource)
 {
    return nv04_resource(resource)->domain != 0;
@@ -93,7 +93,7 @@ struct pipe_resource *
 nouveau_user_buffer_create(struct pipe_screen *screen, void *ptr,
                            unsigned bytes, unsigned usage);
 
-boolean
+bool
 nouveau_user_buffer_upload(struct nouveau_context *, struct nv04_resource *,
                            unsigned base, unsigned size);
 
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
index c2ba0159afe..24deb7ee4c0 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.h
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -13,7 +13,7 @@ struct nouveau_context {
    struct nouveau_client *client;
    struct nouveau_pushbuf *pushbuf;
 
-   boolean vbo_dirty;
+   bool vbo_dirty;
 
    void (*copy_data)(struct nouveau_context *,
                      struct nouveau_bo *dst, unsigned, unsigned,
@@ -53,7 +53,7 @@ struct nouveau_context {
    } stats;
 };
 
-static INLINE struct nouveau_context *
+static inline struct nouveau_context *
 nouveau_context(struct pipe_context *pipe)
 {
    return (struct nouveau_context *)pipe;
@@ -69,7 +69,7 @@ nouveau_scratch_runout_release(struct nouveau_context *);
  * because we don't want to un-bo_ref each allocation every time. This is less
  * work, and we need the wrap index anyway for extreme situations.
  */
-static INLINE void
+static inline void
 nouveau_scratch_done(struct nouveau_context *nv)
 {
    nv->scratch.wrap = nv->scratch.id;
@@ -84,7 +84,7 @@ void *
 nouveau_scratch_get(struct nouveau_context *, unsigned size, uint64_t *gpu_addr,
                     struct nouveau_bo **);
 
-static INLINE void
+static inline void
 nouveau_context_destroy(struct nouveau_context *ctx)
 {
    int i;
@@ -96,7 +96,7 @@ nouveau_context_destroy(struct nouveau_context *ctx)
    FREE(ctx);
 }
 
-static INLINE  void
+static inline  void
 nouveau_context_update_frame_stats(struct nouveau_context *nv)
 {
    nv->stats.buf_cache_frame <<= 1;
@@ -104,7 +104,7 @@ nouveau_context_update_frame_stats(struct nouveau_context *nv)
       nv->stats.buf_cache_count = 0;
       nv->stats.buf_cache_frame |= 1;
       if ((nv->stats.buf_cache_frame & 0xf) == 0xf)
-         nv->screen->hint_buf_keep_sysmem_copy = TRUE;
+         nv->screen->hint_buf_keep_sysmem_copy = true;
    }
 }
 
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c
index 17a5174594d..abcdb479954 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -28,13 +28,13 @@
 #include <sched.h>
 #endif
 
-boolean
+bool
 nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence,
-                  boolean emit)
+                  bool emit)
 {
    *fence = CALLOC_STRUCT(nouveau_fence);
    if (!*fence)
-      return FALSE;
+      return false;
 
    (*fence)->screen = screen;
    (*fence)->ref = 1;
@@ -43,7 +43,7 @@ nouveau_fence_new(struct nouveau_screen *screen, struct nouveau_fence **fence,
    if (emit)
       nouveau_fence_emit(*fence);
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -58,7 +58,7 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence)
    }
 }
 
-boolean
+bool
 nouveau_fence_work(struct nouveau_fence *fence,
                    void (*func)(void *), void *data)
 {
@@ -66,16 +66,16 @@ nouveau_fence_work(struct nouveau_fence *fence,
 
    if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
       func(data);
-      return TRUE;
+      return true;
    }
 
    work = CALLOC_STRUCT(nouveau_fence_work);
    if (!work)
-      return FALSE;
+      return false;
    work->func = func;
    work->data = data;
    LIST_ADD(&work->list, &fence->work);
-   return TRUE;
+   return true;
 }
 
 void
@@ -132,7 +132,7 @@ nouveau_fence_del(struct nouveau_fence *fence)
 }
 
 void
-nouveau_fence_update(struct nouveau_screen *screen, boolean flushed)
+nouveau_fence_update(struct nouveau_screen *screen, bool flushed)
 {
    struct nouveau_fence *fence;
    struct nouveau_fence *next = NULL;
@@ -167,21 +167,21 @@ nouveau_fence_update(struct nouveau_screen *screen, boolean flushed)
 
 #define NOUVEAU_FENCE_MAX_SPINS (1 << 31)
 
-boolean
+bool
 nouveau_fence_signalled(struct nouveau_fence *fence)
 {
    struct nouveau_screen *screen = fence->screen;
 
    if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
-      return TRUE;
+      return true;
 
    if (fence->state >= NOUVEAU_FENCE_STATE_EMITTED)
-      nouveau_fence_update(screen, FALSE);
+      nouveau_fence_update(screen, false);
 
    return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED;
 }
 
-boolean
+bool
 nouveau_fence_wait(struct nouveau_fence *fence)
 {
    struct nouveau_screen *screen = fence->screen;
@@ -195,16 +195,16 @@ nouveau_fence_wait(struct nouveau_fence *fence)
 
    if (fence->state < NOUVEAU_FENCE_STATE_FLUSHED)
       if (nouveau_pushbuf_kick(screen->pushbuf, screen->pushbuf->channel))
-         return FALSE;
+         return false;
 
    if (fence == screen->fence.current)
       nouveau_fence_next(screen);
 
    do {
-      nouveau_fence_update(screen, FALSE);
+      nouveau_fence_update(screen, false);
 
       if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
-         return TRUE;
+         return true;
       if (!spins)
          NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1);
       spins++;
@@ -218,7 +218,7 @@ nouveau_fence_wait(struct nouveau_fence *fence)
                 fence->sequence,
                 screen->fence.sequence_ack, screen->fence.sequence);
 
-   return FALSE;
+   return false;
 }
 
 void
@@ -229,5 +229,5 @@ nouveau_fence_next(struct nouveau_screen *screen)
 
    nouveau_fence_ref(NULL, &screen->fence.current);
 
-   nouveau_fence_new(screen, &screen->fence.current, FALSE);
+   nouveau_fence_new(screen, &screen->fence.current, false);
 }
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h
index 7bb132a5d15..a1587051b0f 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.h
+++ b/src/gallium/drivers/nouveau/nouveau_fence.h
@@ -29,15 +29,15 @@ struct nouveau_fence {
 void nouveau_fence_emit(struct nouveau_fence *);
 void nouveau_fence_del(struct nouveau_fence *);
 
-boolean nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
-                          boolean emit);
-boolean nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
-void    nouveau_fence_update(struct nouveau_screen *, boolean flushed);
-void    nouveau_fence_next(struct nouveau_screen *);
-boolean nouveau_fence_wait(struct nouveau_fence *);
-boolean nouveau_fence_signalled(struct nouveau_fence *);
+bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
+                       bool emit);
+bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
+void nouveau_fence_update(struct nouveau_screen *, bool flushed);
+void nouveau_fence_next(struct nouveau_screen *);
+bool nouveau_fence_wait(struct nouveau_fence *);
+bool nouveau_fence_signalled(struct nouveau_fence *);
 
-static INLINE void
+static inline void
 nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref)
 {
    if (fence)
@@ -51,7 +51,7 @@ nouveau_fence_ref(struct nouveau_fence *fence, struct nouveau_fence **ref)
    *ref = fence;
 }
 
-static INLINE struct nouveau_fence *
+static inline struct nouveau_fence *
 nouveau_fence(struct pipe_fence_handle *fence)
 {
    return (struct nouveau_fence *)fence;
diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h
index ff97aaa9af0..1538c7b6e57 100644
--- a/src/gallium/drivers/nouveau/nouveau_gldefs.h
+++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h
@@ -1,7 +1,7 @@
 #ifndef __NOUVEAU_GLDEFS_H__
 #define __NOUVEAU_GLDEFS_H__
 
-static INLINE unsigned
+static inline unsigned
 nvgl_blend_func(unsigned factor)
 {
 	switch (factor) {
@@ -40,7 +40,7 @@ nvgl_blend_func(unsigned factor)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_blend_eqn(unsigned func)
 {
 	switch (func) {
@@ -59,7 +59,7 @@ nvgl_blend_eqn(unsigned func)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_logicop_func(unsigned func)
 {
 	switch (func) {
@@ -100,7 +100,7 @@ nvgl_logicop_func(unsigned func)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_comparison_op(unsigned op)
 {
 	switch (op) {
@@ -125,7 +125,7 @@ nvgl_comparison_op(unsigned op)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_polygon_mode(unsigned mode)
 {
 	switch (mode) {
@@ -140,7 +140,7 @@ nvgl_polygon_mode(unsigned mode)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_stencil_op(unsigned op)
 {
 	switch (op) {
@@ -165,7 +165,7 @@ nvgl_stencil_op(unsigned op)
 	}
 }
 
-static INLINE unsigned
+static inline unsigned
 nvgl_primitive(unsigned prim) {
 	switch (prim) {
 	case PIPE_PRIM_POINTS:
diff --git a/src/gallium/drivers/nouveau/nouveau_mm.c b/src/gallium/drivers/nouveau/nouveau_mm.c
index 9c454c56db0..43b3d99f48a 100644
--- a/src/gallium/drivers/nouveau/nouveau_mm.c
+++ b/src/gallium/drivers/nouveau/nouveau_mm.c
@@ -70,7 +70,7 @@ mm_slab_alloc(struct mm_slab *slab)
    return -1;
 }
 
-static INLINE void
+static inline void
 mm_slab_free(struct mm_slab *slab, int i)
 {
    assert(i < slab->count);
@@ -79,7 +79,7 @@ mm_slab_free(struct mm_slab *slab, int i)
    assert(slab->free <= slab->count);
 }
 
-static INLINE int
+static inline int
 mm_get_order(uint32_t size)
 {
    int s = __builtin_clz(size) ^ 31;
@@ -104,7 +104,7 @@ mm_bucket_by_size(struct nouveau_mman *cache, unsigned size)
 }
 
 /* size of bo allocation for slab with chunks of (1 << chunk_order) bytes */
-static INLINE uint32_t
+static inline uint32_t
 mm_default_slab_size(unsigned chunk_order)
 {
    static const int8_t slab_order[MM_MAX_ORDER - MM_MIN_ORDER + 1] =
@@ -263,7 +263,7 @@ nouveau_mm_create(struct nouveau_device *dev, uint32_t domain,
    return cache;
 }
 
-static INLINE void
+static inline void
 nouveau_mm_free_slabs(struct list_head *head)
 {
    struct mm_slab *slab, *next;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index c6e5074db19..b2290e7e784 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -67,18 +67,14 @@ nouveau_screen_fence_ref(struct pipe_screen *pscreen,
 	nouveau_fence_ref(nouveau_fence(pfence), (struct nouveau_fence **)ptr);
 }
 
-static boolean
-nouveau_screen_fence_signalled(struct pipe_screen *screen,
-                               struct pipe_fence_handle *pfence)
-{
-        return nouveau_fence_signalled(nouveau_fence(pfence));
-}
-
 static boolean
 nouveau_screen_fence_finish(struct pipe_screen *screen,
 			    struct pipe_fence_handle *pfence,
                             uint64_t timeout)
 {
+	if (!timeout)
+		return nouveau_fence_signalled(nouveau_fence(pfence));
+
 	return nouveau_fence_wait(nouveau_fence(pfence));
 }
 
@@ -115,7 +111,7 @@ nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
 }
 
 
-boolean
+bool
 nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
 			     struct nouveau_bo *bo,
 			     unsigned stride,
@@ -127,11 +123,11 @@ nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
 		return nouveau_bo_name_get(bo, &whandle->handle) == 0;
 	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
 		whandle->handle = bo->handle;
-		return TRUE;
+		return true;
 	} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
 		return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0;
 	} else {
-		return FALSE;
+		return false;
 	}
 }
 
@@ -203,7 +199,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 	pscreen->get_timestamp = nouveau_screen_get_timestamp;
 
 	pscreen->fence_reference = nouveau_screen_fence_ref;
-	pscreen->fence_signalled = nouveau_screen_fence_signalled;
 	pscreen->fence_finish = nouveau_screen_fence_finish;
 
 	util_format_s3tc_init();
@@ -214,7 +209,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 		PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
 		PIPE_BIND_CURSOR |
 		PIPE_BIND_SAMPLER_VIEW |
-		PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE |
+		PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE |
+                PIPE_BIND_COMPUTE_RESOURCE |
 		PIPE_BIND_GLOBAL;
 	screen->sysmem_bindings =
 		PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index 30041b271c9..4fdde9fbf3d 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -49,7 +49,7 @@ struct nouveau_screen {
 
 	int64_t cpu_gpu_time_delta;
 
-	boolean hint_buf_keep_sysmem_copy;
+	bool hint_buf_keep_sysmem_copy;
 
 	unsigned vram_domain;
 
@@ -112,15 +112,15 @@ struct nouveau_screen {
 # define NOUVEAU_DRV_STAT_IFD(x)
 #endif
 
-static INLINE struct nouveau_screen *
+static inline struct nouveau_screen *
 nouveau_screen(struct pipe_screen *pscreen)
 {
 	return (struct nouveau_screen *)pscreen;
 }
 
-boolean nouveau_drm_screen_unref(struct nouveau_screen *screen);
+bool nouveau_drm_screen_unref(struct nouveau_screen *screen);
 
-boolean
+bool
 nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
 			     struct nouveau_bo *bo,
 			     unsigned stride,
diff --git a/src/gallium/drivers/nouveau/nouveau_statebuf.h b/src/gallium/drivers/nouveau/nouveau_statebuf.h
index 4f8bd7bdf16..f38014091ba 100644
--- a/src/gallium/drivers/nouveau/nouveau_statebuf.h
+++ b/src/gallium/drivers/nouveau/nouveau_statebuf.h
@@ -20,7 +20,7 @@ struct nouveau_statebuf_builder
 #define sb_data(sb, v) *(sb).p++ = (v)
 #endif
 
-static INLINE uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size)
+static inline uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size)
 {
 	return (size << 18) | (subc << 13) | mthd;
 }
diff --git a/src/gallium/drivers/nouveau/nouveau_video.c b/src/gallium/drivers/nouveau/nouveau_video.c
index d6330fa63a8..e414a534418 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_video.c
@@ -100,7 +100,7 @@ nouveau_vpe_fini(struct nouveau_decoder *dec) {
    dec->current = dec->future = dec->past = 8;
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb)
 {
    int cbb;
@@ -125,7 +125,7 @@ nouveau_vpe_mb_dct_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_
    }
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12_macroblock *mb)
 {
    int cbb;
@@ -143,7 +143,7 @@ nouveau_vpe_mb_data_blocks(struct nouveau_decoder *dec, const struct pipe_mpeg12
    }
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec,
                           const struct pipe_mpeg12_macroblock *mb,
                           bool luma)
@@ -187,7 +187,7 @@ nouveau_vpe_mb_dct_header(struct nouveau_decoder *dec,
                      x | (y << NV17_MPEG_CMD_MB_COORDS_Y__SHIFT));
 }
 
-static INLINE unsigned int
+static inline unsigned int
 nouveau_vpe_mb_mv_flags(bool luma, int mv_h, int mv_v, bool forward, bool first, bool vert)
 {
    unsigned mc_header = 0;
@@ -228,7 +228,7 @@ static int div_up(int val, int mult) {
    return val / mult;
 }
 
-static INLINE void
+static inline void
 nouveau_vpe_mb_mv(struct nouveau_decoder *dec, unsigned mc_header,
                    bool luma, bool frame, bool forward, bool vert,
                    int x, int y, const short motions[2],
@@ -296,16 +296,16 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec,
       case PIPE_MPEG12_MO_TYPE_DUAL_PRIME: {
          base = NV17_MPEG_CMD_CHROMA_MV_HEADER_COUNT_2;
          if (forward) {
-            nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE,
-                              x, y, mb->PMV[0][0], dec->past, TRUE);
-            nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, TRUE,
-                              x, y2, mb->PMV[0][0], dec->past, FALSE);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, true, false,
+                              x, y, mb->PMV[0][0], dec->past, true);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, true, true,
+                              x, y2, mb->PMV[0][0], dec->past, false);
          }
          if (backward && forward) {
-            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, TRUE,
-                              x, y, mb->PMV[1][0], dec->future, TRUE);
-            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE,
-                              x, y2, mb->PMV[1][1], dec->future, FALSE);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, true,
+                              x, y, mb->PMV[1][0], dec->future, true);
+            nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, false,
+                              x, y2, mb->PMV[1][1], dec->future, false);
          } else assert(!backward);
          break;
       }
@@ -320,13 +320,13 @@ nouveau_vpe_mb_mv_header(struct nouveau_decoder *dec,
          if (frame)
             base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME;
          if (forward)
-            nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE,
+            nouveau_vpe_mb_mv(dec, base, luma, frame, true,
                               dec->picture_structure != PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP,
-                              x, y, mb->PMV[0][0], dec->past, TRUE);
+                              x, y, mb->PMV[0][0], dec->past, true);
          if (backward && forward)
-            nouveau_vpe_mb_mv(dec, base, luma, frame, FALSE,
+            nouveau_vpe_mb_mv(dec, base, luma, frame, false,
                               dec->picture_structure == PIPE_MPEG12_PICTURE_STRUCTURE_FIELD_TOP,
-                              x, y, mb->PMV[0][1], dec->future, TRUE);
+                              x, y, mb->PMV[0][1], dec->future, true);
          else assert(!backward);
          break;
       }
@@ -341,11 +341,11 @@ mv1:
        base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_TYPE_FRAME;
     /* frame 16x16 */
    if (forward)
-       nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE, FALSE,
-                         x, y, mb->PMV[0][0], dec->past, TRUE);
+       nouveau_vpe_mb_mv(dec, base, luma, frame, true, false,
+                         x, y, mb->PMV[0][0], dec->past, true);
    if (backward)
-       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, FALSE,
-                         x, y, mb->PMV[0][1], dec->future, TRUE);
+       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward, false,
+                         x, y, mb->PMV[0][1], dec->future, true);
     return;
 
 mv2:
@@ -353,20 +353,20 @@ mv2:
    if (!frame)
       base |= NV17_MPEG_CMD_CHROMA_MV_HEADER_MV_SPLIT_HALF_MB;
    if (forward) {
-      nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE,
+      nouveau_vpe_mb_mv(dec, base, luma, frame, true,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_FORWARD,
-                        x, y, mb->PMV[0][0], dec->past, TRUE);
-      nouveau_vpe_mb_mv(dec, base, luma, frame, TRUE,
+                        x, y, mb->PMV[0][0], dec->past, true);
+      nouveau_vpe_mb_mv(dec, base, luma, frame, true,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_FORWARD,
-                        x, y2, mb->PMV[1][0], dec->past, FALSE);
+                        x, y2, mb->PMV[1][0], dec->past, false);
    }
    if (backward) {
       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_FIRST_BACKWARD,
-                        x, y, mb->PMV[0][1], dec->future, TRUE);
+                        x, y, mb->PMV[0][1], dec->future, true);
       nouveau_vpe_mb_mv(dec, base, luma, frame, !forward,
                         mb->motion_vertical_field_select & PIPE_MPEG12_FS_SECOND_BACKWARD,
-                        x, y2, mb->PMV[1][1], dec->future, FALSE);
+                        x, y2, mb->PMV[1][1], dec->future, false);
    }
 }
 
@@ -438,14 +438,14 @@ nouveau_decoder_decode_macroblock(struct pipe_video_codec *decoder,
    mb = (const struct pipe_mpeg12_macroblock *)pipe_mb;
    for (i = 0; i < num_macroblocks; ++i, mb++) {
       if (mb->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA) {
-         nouveau_vpe_mb_dct_header(dec, mb, TRUE);
-         nouveau_vpe_mb_dct_header(dec, mb, FALSE);
+         nouveau_vpe_mb_dct_header(dec, mb, true);
+         nouveau_vpe_mb_dct_header(dec, mb, false);
       } else {
-         nouveau_vpe_mb_mv_header(dec, mb, TRUE);
-         nouveau_vpe_mb_dct_header(dec, mb, TRUE);
+         nouveau_vpe_mb_mv_header(dec, mb, true);
+         nouveau_vpe_mb_dct_header(dec, mb, true);
 
-         nouveau_vpe_mb_mv_header(dec, mb, FALSE);
-         nouveau_vpe_mb_dct_header(dec, mb, FALSE);
+         nouveau_vpe_mb_mv_header(dec, mb, false);
+         nouveau_vpe_mb_dct_header(dec, mb, false);
       }
       if (dec->base.entrypoint <= PIPE_VIDEO_ENTRYPOINT_IDCT)
          nouveau_vpe_mb_dct_blocks(dec, mb);
diff --git a/src/gallium/drivers/nouveau/nouveau_video.h b/src/gallium/drivers/nouveau/nouveau_video.h
index 08d48b371fd..fd1bd527deb 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_video.h
@@ -45,7 +45,7 @@ struct nouveau_decoder {
 #define NV31_VIDEO_BIND_CMD     NV31_MPEG_IMAGE_Y_OFFSET__LEN
 #define NV31_VIDEO_BIND_COUNT  (NV31_MPEG_IMAGE_Y_OFFSET__LEN + 1)
 
-static INLINE void
+static inline void
 nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) {
    dec->cmds[dec->ofs++] = data;
 }
@@ -54,33 +54,33 @@ nouveau_vpe_write(struct nouveau_decoder *dec, unsigned data) {
 #define NV31_MPEG(mthd) SUBC_MPEG(NV31_MPEG_##mthd)
 #define NV84_MPEG(mthd) SUBC_MPEG(NV84_MPEG_##mthd)
 
-static INLINE uint32_t
+static inline uint32_t
 NV04_FIFO_PKHDR(int subc, int mthd, unsigned size)
 {
    return 0x00000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NV04_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
 {
    return 0x40000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE void
+static inline void
 BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
    PUSH_SPACE(push, size + 1);
    PUSH_DATA (push, NV04_FIFO_PKHDR(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
    PUSH_SPACE(push, size + 1);
    PUSH_DATA (push, NV04_FIFO_PKHDR_NI(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd,
            struct nouveau_bo *bo, uint32_t offset,
 	   struct nouveau_bufctx *ctx, int bin, uint32_t rw)
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
index 279a1ce18ef..33e3bef3df3 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
@@ -135,22 +135,22 @@ struct comm {
 	uint32_t parse_endpos[0x10]; // 1c0
 };
 
-static INLINE uint32_t nouveau_vp3_video_align(uint32_t h)
+static inline uint32_t nouveau_vp3_video_align(uint32_t h)
 {
    return ((h+0x3f)&~0x3f);
 };
 
-static INLINE uint32_t mb(uint32_t coord)
+static inline uint32_t mb(uint32_t coord)
 {
    return (coord + 0xf)>>4;
 }
 
-static INLINE uint32_t mb_half(uint32_t coord)
+static inline uint32_t mb_half(uint32_t coord)
 {
    return (coord + 0x1f)>>5;
 }
 
-static INLINE uint64_t
+static inline uint64_t
 nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target)
 {
    uint64_t ret;
@@ -161,7 +161,7 @@ nouveau_vp3_video_addr(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video
    return dec->ref_bo->offset + ret;
 }
 
-static INLINE void
+static inline void
 nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2,
                           uint32_t *cbcr, uint32_t *cbcr2)
 {
@@ -182,7 +182,7 @@ nouveau_vp3_ycbcr_offsets(struct nouveau_vp3_decoder *dec, uint32_t *y2,
    }
 }
 
-static INLINE void
+static inline void
 nouveau_vp3_inter_sizes(struct nouveau_vp3_decoder *dec, uint32_t slice_count,
                         uint32_t *slice_size, uint32_t *bucket_size,
                         uint32_t *ring_size)
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 51effb1d8d2..389a229eb78 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -15,34 +15,34 @@
 #define NOUVEAU_MIN_BUFFER_MAP_ALIGN      64
 #define NOUVEAU_MIN_BUFFER_MAP_ALIGN_MASK (NOUVEAU_MIN_BUFFER_MAP_ALIGN - 1)
 
-static INLINE uint32_t
+static inline uint32_t
 PUSH_AVAIL(struct nouveau_pushbuf *push)
 {
    return push->end - push->cur;
 }
 
-static INLINE boolean
+static inline bool
 PUSH_SPACE(struct nouveau_pushbuf *push, uint32_t size)
 {
    if (PUSH_AVAIL(push) < size)
       return nouveau_pushbuf_space(push, size, 0, 0) == 0;
-   return TRUE;
+   return true;
 }
 
-static INLINE void
+static inline void
 PUSH_DATA(struct nouveau_pushbuf *push, uint32_t data)
 {
    *push->cur++ = data;
 }
 
-static INLINE void
+static inline void
 PUSH_DATAp(struct nouveau_pushbuf *push, const void *data, uint32_t size)
 {
    memcpy(push->cur, data, size * 4);
    push->cur += size;
 }
 
-static INLINE void
+static inline void
 PUSH_DATAf(struct nouveau_pushbuf *push, float f)
 {
    union { float f; uint32_t i; } u;
@@ -50,7 +50,7 @@ PUSH_DATAf(struct nouveau_pushbuf *push, float f)
    PUSH_DATA(push, u.i);
 }
 
-static INLINE void
+static inline void
 PUSH_KICK(struct nouveau_pushbuf *push)
 {
    nouveau_pushbuf_kick(push, push->channel);
@@ -60,7 +60,7 @@ PUSH_KICK(struct nouveau_pushbuf *push)
 #define NOUVEAU_RESOURCE_FLAG_LINEAR   (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
 #define NOUVEAU_RESOURCE_FLAG_DRV_PRIV (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
 
-static INLINE uint32_t
+static inline uint32_t
 nouveau_screen_transfer_flags(unsigned pipe)
 {
 	uint32_t flags = 0;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
index 447f4b3b7ae..95468e580dd 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
@@ -1459,6 +1459,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define NV40_3D_VTX_CACHE_INVALIDATE				0x00001714
 
+#define NV40_3D_VB_ELEMENT_BASE					0x0000173c
+
 #define NV30_3D_VTXFMT(i0)				       (0x00001740 + 0x4*(i0))
 #define NV30_3D_VTXFMT__ESIZE					0x00000004
 #define NV30_3D_VTXFMT__LEN					0x00000010
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
index 83fd1fa38dd..118cac77277 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
@@ -32,7 +32,7 @@
 #include "nv30/nv30_context.h"
 #include "nv30/nv30_format.h"
 
-static INLINE uint32_t
+static inline uint32_t
 pack_rgba(enum pipe_format format, const float *rgba)
 {
    union util_color uc;
@@ -40,7 +40,7 @@ pack_rgba(enum pipe_format format, const float *rgba)
    return uc.ui[0];
 }
 
-static INLINE uint32_t
+static inline uint32_t
 pack_zeta(enum pipe_format format, double depth, unsigned stencil)
 {
    uint32_t zuint = (uint32_t)(depth * 4294967295.0);
@@ -58,7 +58,7 @@ nv30_clear(struct pipe_context *pipe, unsigned buffers,
    struct pipe_framebuffer_state *fb = &nv30->framebuffer;
    uint32_t colr = 0, zeta = 0, mode = 0;
 
-   if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, TRUE))
+   if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, true))
       return;
 
    if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index 617b0887810..6e88ed725d6 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -45,7 +45,7 @@ nv30_context_kick_notify(struct nouveau_pushbuf *push)
    screen = &nv30->screen->base;
 
    nouveau_fence_next(screen);
-   nouveau_fence_update(screen, TRUE);
+   nouveau_fence_update(screen, true);
 
    if (push->bufctx) {
       struct nouveau_bufref *bref;
@@ -165,6 +165,12 @@ nv30_context_destroy(struct pipe_context *pipe)
    if (nv30->draw)
       draw_destroy(nv30->draw);
 
+   if (nv30->blit_vp)
+      nouveau_heap_free(&nv30->blit_vp);
+
+   if (nv30->blit_fp)
+      pipe_resource_reference(&nv30->blit_fp, NULL);
+
    if (nv30->screen->base.pushbuf->user_priv == &nv30->bufctx)
       nv30->screen->base.pushbuf->user_priv = NULL;
 
@@ -233,7 +239,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv)
 
    nv30->config.aniso = NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_OFF;
 
-   if (debug_get_bool_option("NV30_SWTNL", FALSE))
+   if (debug_get_bool_option("NV30_SWTNL", false))
       nv30->draw_flags |= NV30_NEW_SWTNL;
 
    nv30->sample_mask = 0xffff;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
index 592cdbe24f9..d5c18bb62dc 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -51,7 +51,8 @@ struct nv30_context {
       unsigned rt_enable;
       unsigned scissor_off;
       unsigned num_vtxelts;
-      boolean  prim_restart;
+      int index_bias;
+      bool prim_restart;
       struct nv30_fragprog *fragprog;
    } state;
 
@@ -114,17 +115,17 @@ struct nv30_context {
    uint32_t vbo_user;
    unsigned vbo_min_index;
    unsigned vbo_max_index;
-   boolean  vbo_push_hint;
+   bool vbo_push_hint;
 
    struct nouveau_heap  *blit_vp;
    struct pipe_resource *blit_fp;
 
    struct pipe_query *render_cond_query;
    unsigned render_cond_mode;
-   boolean render_cond_cond;
+   bool render_cond_cond;
 };
 
-static INLINE struct nv30_context *
+static inline struct nv30_context *
 nv30_context(struct pipe_context *pipe)
 {
    return (struct nv30_context *)pipe;
@@ -203,8 +204,8 @@ nv30_draw_init(struct pipe_context *pipe);
 void
 nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
 
-boolean
-nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl);
+bool
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl);
 
 void
 nv30_state_release(struct nv30_context *nv30);
@@ -213,7 +214,7 @@ nv30_state_release(struct nv30_context *nv30);
 #define NV30_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NV30_3D_VERTEX_BEGIN_END_##n
 
-static INLINE unsigned
+static inline unsigned
 nv30_prim_gl(unsigned prim)
 {
    switch (prim) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index c1665b7ad2f..098d6e499fa 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -52,7 +52,7 @@ struct nv30_render {
    uint32_t prim;
 };
 
-static INLINE struct nv30_render *
+static inline struct nv30_render *
 nv30_render(struct vbuf_render *render)
 {
    return (struct nv30_render *)render;
@@ -79,12 +79,12 @@ nv30_render_allocate_vertices(struct vbuf_render *render,
                                      PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM,
                                      render->max_vertex_buffer_bytes);
       if (!r->buffer)
-         return FALSE;
+         return false;
 
       r->offset = 0;
    }
 
-   return TRUE;
+   return true;
 }
 
 static void *
@@ -134,7 +134,7 @@ nv30_render_draw_elements(struct vbuf_render *render,
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, ~0, FALSE))
+   if (!nv30_state_validate(nv30, ~0, false))
       return;
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -179,7 +179,7 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, ~0, FALSE))
+   if (!nv30_state_validate(nv30, ~0, false))
       return;
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -221,7 +221,7 @@ static const struct {
    [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
 };
 
-static boolean
+static bool
 vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
 {
    struct nv30_screen *screen = r->nv30->screen;
@@ -245,7 +245,7 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
    }
 
    if (emit == EMIT_OMIT)
-      return FALSE;
+      return false;
 
    draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
    format = draw_translate_vinfo_format(emit);
@@ -272,10 +272,10 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
       assert(sem == TGSI_SEMANTIC_TEXCOORD);
       *idx = 0x00001000 << (result - 8);
    }
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nv30_render_validate(struct nv30_context *nv30)
 {
    struct nv30_render *r = nv30_render(nv30->draw->render);
@@ -300,7 +300,7 @@ nv30_render_validate(struct nv30_context *nv30)
          }
 
          if (nouveau_heap_alloc(heap, 16, &r->vertprog, &r->vertprog))
-            return FALSE;
+            return false;
       }
    }
 
@@ -370,7 +370,7 @@ nv30_render_validate(struct nv30_context *nv30)
    }
 
    vinfo->size /= 4;
-   return TRUE;
+   return true;
 }
 
 void
@@ -519,6 +519,6 @@ nv30_draw_init(struct pipe_context *pipe)
    draw_set_rasterize_stage(draw, stage);
    draw_wide_line_threshold(draw, 10000000.f);
    draw_wide_point_threshold(draw, 10000000.f);
-   draw_wide_point_sprites(draw, TRUE);
+   draw_wide_point_sprites(draw, true);
    nv30->draw = draw;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_format.h b/src/gallium/drivers/nouveau/nv30/nv30_format.h
index 8bf4a37299f..fa1e922fb65 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_format.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_format.h
@@ -27,28 +27,28 @@ struct nv30_texfmt {
 };
 
 extern const struct nv30_format_info nv30_format_info_table[];
-static INLINE const struct nv30_format_info *
+static inline const struct nv30_format_info *
 nv30_format_info(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_format_info_table[format];
 }
 
 extern const struct nv30_format nv30_format_table[];
-static INLINE const struct nv30_format *
+static inline const struct nv30_format *
 nv30_format(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_format_table[format];
 }
 
 extern const struct nv30_vtxfmt nv30_vtxfmt_table[];
-static INLINE const struct nv30_vtxfmt *
+static inline const struct nv30_vtxfmt *
 nv30_vtxfmt(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_vtxfmt_table[format];
 }
 
 extern const struct nv30_texfmt nv30_texfmt_table[];
-static INLINE const struct nv30_texfmt *
+static inline const struct nv30_texfmt *
 nv30_texfmt(struct pipe_screen *pscreen, enum pipe_format format)
 {
    return &nv30_texfmt_table[format];
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
index 7f227868f73..6de61bcc1c0 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -37,22 +37,26 @@ nv30_fragprog_upload(struct nv30_context *nv30)
    struct nouveau_context *nv = &nv30->base;
    struct nv30_fragprog *fp = nv30->fragprog.program;
    struct pipe_context *pipe = &nv30->base.pipe;
-   struct pipe_transfer *transfer;
-   uint32_t *map;
-   int i; (void)i;
 
-   if (unlikely(!fp->buffer)) {
+   if (unlikely(!fp->buffer))
       fp->buffer = pipe_buffer_create(pipe->screen, 0, 0, fp->insn_len * 4);
-   }
 
-   map = pipe_buffer_map(pipe, fp->buffer, PIPE_TRANSFER_WRITE, &transfer);
 #ifndef PIPE_ARCH_BIG_ENDIAN
-   memcpy(map, fp->insn, fp->insn_len * 4);
+   pipe_buffer_write(pipe, fp->buffer, 0, fp->insn_len * 4, fp->insn);
 #else
-   for (i = 0; i < fp->insn_len; i++)
-      *map++ = (fp->insn[i] >> 16) | (fp->insn[i] << 16);
+   {
+      struct pipe_transfer *transfer;
+      uint32_t *map;
+      int i;
+
+      map = pipe_buffer_map(pipe, fp->buffer,
+                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
+                            &transfer);
+      for (i = 0; i < fp->insn_len; i++)
+         *map++ = (fp->insn[i] >> 16) | (fp->insn[i] << 16);
+      pipe_buffer_unmap(pipe, transfer);
+   }
 #endif
-   pipe_buffer_unmap(pipe, transfer);
 
    if (nv04_resource(fp->buffer)->domain != NOUVEAU_BO_VRAM)
       nouveau_buffer_migrate(nv, nv04_resource(fp->buffer), NOUVEAU_BO_VRAM);
@@ -64,7 +68,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
    struct nouveau_object *eng3d = nv30->screen->eng3d;
    struct nv30_fragprog *fp = nv30->fragprog.program;
-   boolean upload = FALSE;
+   bool upload = false;
    int i;
 
    if (!fp->translated) {
@@ -72,7 +76,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
       if (!fp->translated)
          return;
 
-      upload = TRUE;
+      upload = true;
    }
 
    /* update constants, also needs to be done on every fp switch as we
@@ -89,7 +93,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
          if (!memcmp(&fp->insn[off], &cbuf[idx], 4 * 4))
             continue;
          memcpy(&fp->insn[off], &cbuf[idx], 4 * 4);
-         upload = TRUE;
+         upload = true;
       }
    }
 
@@ -161,8 +165,15 @@ static void
 nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso)
 {
    struct nv30_context *nv30 = nv30_context(pipe);
+   struct nv30_fragprog *fp = hwcso;
 
-   nv30->fragprog.program = hwcso;
+   /* reset the bufctx so that we don't keep a dangling reference to the fp
+    * code
+    */
+   if (fp != nv30->state.fragprog)
+      PUSH_RESET(nv30->base.pushbuf, BUFCTX_FRAGPROG);
+
+   nv30->fragprog.program = fp;
    nv30->dirty |= NV30_NEW_FRAGPROG;
 }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
index 1a4b8929c0f..c75b4b95fd8 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
@@ -33,7 +33,7 @@
 #include "nv30/nv30_resource.h"
 #include "nv30/nv30_transfer.h"
 
-static INLINE unsigned
+static inline unsigned
 layer_offset(struct pipe_resource *pt, unsigned level, unsigned layer)
 {
    struct nv30_miptree *mt = nv30_miptree(pt);
@@ -54,7 +54,7 @@ nv30_miptree_get_handle(struct pipe_screen *pscreen,
    unsigned stride;
 
    if (!mt || !mt->base.bo)
-      return FALSE;
+      return false;
 
    stride = mt->level[0].pitch;
 
@@ -78,13 +78,13 @@ struct nv30_transfer {
    unsigned nblocksy;
 };
 
-static INLINE struct nv30_transfer *
+static inline struct nv30_transfer *
 nv30_transfer(struct pipe_transfer *ptx)
 {
    return (struct nv30_transfer *)ptx;
 }
 
-static INLINE void
+static inline void
 define_rect(struct pipe_resource *pt, unsigned level, unsigned z,
             unsigned x, unsigned y, unsigned w, unsigned h,
             struct nv30_rect *rect)
@@ -242,8 +242,8 @@ nv30_miptree_transfer_map(struct pipe_context *pipe, struct pipe_resource *pt,
    tx->base.level = level;
    tx->base.usage = usage;
    tx->base.box = *box;
-   tx->base.stride = util_format_get_nblocksx(pt->format, box->width) *
-                     util_format_get_blocksize(pt->format);
+   tx->base.stride = align(util_format_get_nblocksx(pt->format, box->width) *
+                           util_format_get_blocksize(pt->format), 64);
    tx->base.layer_stride = util_format_get_nblocksy(pt->format, box->height) *
                            tx->base.stride;
 
@@ -372,7 +372,7 @@ nv30_miptree_create(struct pipe_screen *pscreen,
    }
 
    if (!mt->uniform_pitch)
-      mt->swizzled = TRUE;
+      mt->swizzled = true;
 
    size = 0;
    for (l = 0; l <= pt->last_level; l++) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_push.c b/src/gallium/drivers/nouveau/nv30/nv30_push.c
index e0734fa70d3..67ab0508c17 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_push.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_push.c
@@ -47,12 +47,12 @@ struct push_context {
 
    struct translate *translate;
 
-   boolean primitive_restart;
+   bool primitive_restart;
    uint32_t prim;
    uint32_t restart_index;
 };
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
 {
    unsigned i;
@@ -62,7 +62,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
 {
    unsigned i;
@@ -72,7 +72,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index)
 {
    unsigned i;
@@ -199,7 +199,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info)
 {
    struct push_context ctx;
    unsigned i, index_size;
-   boolean apply_bias = info->indexed && info->index_bias;
+   bool apply_bias = info->indexed && info->index_bias;
 
    ctx.push = nv30->base.pushbuf;
    ctx.translate = nv30->vertex->translate;
@@ -241,7 +241,7 @@ nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info)
    } else {
       ctx.idxbuf = NULL;
       index_size = 0;
-      ctx.primitive_restart = FALSE;
+      ctx.primitive_restart = false;
       ctx.restart_index = 0;
    }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c
index 516ee83168e..3980be9579a 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_query.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c
@@ -98,7 +98,7 @@ struct nv30_query {
    uint64_t result;
 };
 
-static INLINE struct nv30_query *
+static inline struct nv30_query *
 nv30_query(struct pipe_query *pipe)
 {
    return (struct nv30_query *)pipe;
@@ -208,7 +208,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    if (ntfy1) {
       while (ntfy1[3] & 0xff000000) {
          if (!wait)
-            return FALSE;
+            return false;
       }
 
       switch (q->type) {
@@ -228,7 +228,7 @@ nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    }
 
    *res64 = q->result;
-   return TRUE;
+   return true;
 }
 
 static void
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
index 38fac8af898..a98a6464de8 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_resource.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
@@ -42,12 +42,12 @@ nv30_memory_barrier(struct pipe_context *pipe, unsigned flags)
          if (!nv30->vtxbuf[i].buffer)
             continue;
          if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-            nv30->base.vbo_dirty = TRUE;
+            nv30->base.vbo_dirty = true;
       }
 
       if (nv30->idxbuf.buffer &&
           nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nv30->base.vbo_dirty = TRUE;
+         nv30->base.vbo_dirty = true;
    }
 }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.h b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
index 1981c8d9ab9..8dac7795c9d 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_resource.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
@@ -15,7 +15,7 @@ struct nv30_surface {
    uint16_t depth;
 };
 
-static INLINE struct nv30_surface *
+static inline struct nv30_surface *
 nv30_surface(struct pipe_surface *ps)
 {
    return (struct nv30_surface *)ps;
@@ -32,13 +32,13 @@ struct nv30_miptree {
    struct nv30_miptree_level level[13];
    uint32_t uniform_pitch;
    uint32_t layer_size;
-   boolean swizzled;
+   bool swizzled;
    unsigned ms_mode;
    unsigned ms_x:1;
    unsigned ms_y:1;
 };
 
-static INLINE struct nv30_miptree *
+static inline struct nv30_miptree *
 nv30_miptree(struct pipe_resource *pt)
 {
    return (struct nv30_miptree *)pt;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 2e38a1978ae..7aad26ba18b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -69,6 +69,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return PIPE_ENDIAN_LITTLE;
    case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
       return 16;
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return NOUVEAU_MIN_BUFFER_MAP_ALIGN;
    case PIPE_CAP_MAX_VIEWPORTS:
       return 1;
    case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
@@ -96,6 +98,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return 1;
+   /* nv35 capabilities */
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
+      return eng3d->oclass == NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS;
    /* nv4x capabilities */
    case PIPE_CAP_BLEND_EQUATION_SEPARATE:
    case PIPE_CAP_NPOT_TEXTURES:
@@ -135,7 +140,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
    case PIPE_CAP_START_INSTANCE:
    case PIPE_CAP_TEXTURE_MULTISAMPLE:
-   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
    case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
    case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
@@ -162,6 +166,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -313,12 +320,12 @@ nv30_screen_is_format_supported(struct pipe_screen *pscreen,
                                 unsigned bindings)
 {
    if (sample_count > 4)
-      return FALSE;
+      return false;
    if (!(0x00000017 & (1 << sample_count)))
-      return FALSE;
+      return false;
 
    if (!util_format_is_supported(format, bindings)) {
-      return FALSE;
+      return false;
    }
 
    /* transfers & shared are always supported */
@@ -656,6 +663,6 @@ nv30_screen_create(struct nouveau_device *dev)
 
    nouveau_pushbuf_kick(push, push->channel);
 
-   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
    return pscreen;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.h b/src/gallium/drivers/nouveau/nv30/nv30_screen.h
index 3f2e47fec99..7b17b88097c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.h
@@ -40,7 +40,7 @@ struct nv30_screen {
    struct nouveau_heap *vp_data_heap;
 };
 
-static INLINE struct nv30_screen *
+static inline struct nv30_screen *
 nv30_screen(struct pipe_screen *pscreen)
 {
    return (struct nv30_screen *)pscreen;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.c b/src/gallium/drivers/nouveau/nv30/nv30_state.c
index 708ba34c1e5..fd604c2266d 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.c
@@ -211,6 +211,7 @@ static void *
 nv30_zsa_state_create(struct pipe_context *pipe,
                       const struct pipe_depth_stencil_alpha_state *cso)
 {
+   struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d;
    struct nv30_zsa_stateobj *so;
 
    so = CALLOC_STRUCT(nv30_zsa_stateobj);
@@ -223,6 +224,13 @@ nv30_zsa_state_create(struct pipe_context *pipe,
    SB_DATA  (so, cso->depth.writemask);
    SB_DATA  (so, cso->depth.enabled);
 
+   if (eng3d->oclass == NV35_3D_CLASS || eng3d->oclass >= NV40_3D_CLASS) {
+      SB_MTHD35(so, DEPTH_BOUNDS_TEST_ENABLE, 3);
+      SB_DATA  (so, cso->depth.bounds_test);
+      SB_DATA  (so, fui(cso->depth.bounds_min));
+      SB_DATA  (so, fui(cso->depth.bounds_max));
+   }
+
    if (cso->stencil[0].enabled) {
       SB_MTHD30(so, STENCIL_ENABLE(0), 3);
       SB_DATA  (so, 1);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.h b/src/gallium/drivers/nouveau/nv30/nv30_state.h
index e27e16fae82..ed3b8103a00 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.h
@@ -13,6 +13,8 @@
 #define SB_DATA(so, u)        (so)->data[(so)->size++] = (u)
 #define SB_MTHD30(so, mthd, size)                                          \
    SB_DATA((so), ((size) << 18) | (7 << 13) | NV30_3D_##mthd)
+#define SB_MTHD35(so, mthd, size)                                          \
+   SB_DATA((so), ((size) << 18) | (7 << 13) | NV35_3D_##mthd)
 #define SB_MTHD40(so, mthd, size)                                          \
    SB_DATA((so), ((size) << 18) | (7 << 13) | NV40_3D_##mthd)
 
@@ -30,7 +32,7 @@ struct nv30_rasterizer_stateobj {
 
 struct nv30_zsa_stateobj {
    struct pipe_depth_stencil_alpha_state pipe;
-   unsigned data[32];
+   unsigned data[36];
    unsigned size;
 };
 
@@ -80,7 +82,7 @@ struct nv30_vertprog {
    struct tgsi_shader_info info;
 
    struct draw_vertex_shader *draw;
-   boolean translated;
+   bool translated;
    unsigned enabled_ucps;
    uint16_t texcoord[10];
 
@@ -109,7 +111,7 @@ struct nv30_fragprog {
    struct tgsi_shader_info info;
 
    struct draw_fragment_shader *draw;
-   boolean translated;
+   bool translated;
 
    uint32_t *insn;
    unsigned insn_len;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
index a954dcce562..8957634f0fa 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
@@ -453,8 +453,8 @@ nv30_state_context_switch(struct nv30_context *nv30)
    nv30->base.pushbuf->user_priv = &nv30->bufctx;
 }
 
-boolean
-nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
+bool
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, bool hwtnl)
 {
    struct nouveau_screen *screen = &nv30->screen->base;
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
@@ -494,7 +494,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
    nouveau_pushbuf_bufctx(push, bctx);
    if (nouveau_pushbuf_validate(push)) {
       nouveau_pushbuf_bufctx(push, NULL);
-      return FALSE;
+      return false;
    }
 
    /*XXX*/
@@ -528,7 +528,7 @@ nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
       }
    }
 
-   return TRUE;
+   return true;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_texture.c b/src/gallium/drivers/nouveau/nv30/nv30_texture.c
index c3567217442..bfe21cceaa2 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_texture.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_texture.c
@@ -37,7 +37,7 @@
 #define NV40_WRAP(n) \
    case PIPE_TEX_WRAP_##n: ret = NV40_3D_TEX_WRAP_S_##n; break
 
-static INLINE unsigned
+static inline unsigned
 wrap_mode(unsigned pipe)
 {
    unsigned ret = NV30_3D_TEX_WRAP_S_REPEAT;
@@ -58,7 +58,7 @@ wrap_mode(unsigned pipe)
    return ret >> NV30_3D_TEX_WRAP_S__SHIFT;
 }
 
-static INLINE unsigned
+static inline unsigned
 filter_mode(const struct pipe_sampler_state *cso)
 {
    unsigned filter;
@@ -104,7 +104,7 @@ filter_mode(const struct pipe_sampler_state *cso)
    return filter;
 }
 
-static INLINE unsigned
+static inline unsigned
 compare_mode(const struct pipe_sampler_state *cso)
 {
    if (cso->compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE)
@@ -201,7 +201,7 @@ nv30_bind_sampler_states(struct pipe_context *pipe,
    }
 }
 
-static INLINE uint32_t
+static inline uint32_t
 swizzle(const struct nv30_texfmt *fmt, unsigned cmp, unsigned swz)
 {
    uint32_t data = fmt->swz[swz].src << 8;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
index 99bc0994ac2..214da6568c3 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
@@ -41,33 +41,33 @@
  * of different ways.
  */
 
-static INLINE boolean
+static inline bool
 nv30_transfer_scaled(struct nv30_rect *src, struct nv30_rect *dst)
 {
    if (src->x1 - src->x0 != dst->x1 - dst->x0)
-      return TRUE;
+      return true;
    if (src->y1 - src->y0 != dst->y1 - dst->y0)
-      return TRUE;
-   return FALSE;
+      return true;
+   return false;
 }
 
-static INLINE boolean
+static inline bool
 nv30_transfer_blit(XFER_ARGS)
 {
    if (nv30->screen->eng3d->oclass < NV40_3D_CLASS)
-      return FALSE;
+      return false;
    if (dst->offset & 63 || dst->pitch & 63 || dst->d > 1)
-      return FALSE;
+      return false;
    if (dst->w < 2 || dst->h < 2)
-      return FALSE;
+      return false;
    if (dst->cpp > 4 || (dst->cpp == 1 && !dst->pitch))
-      return FALSE;
+      return false;
    if (src->cpp > 4)
-      return FALSE;
-   return TRUE;
+      return false;
+   return true;
 }
 
-static INLINE struct nouveau_heap *
+static inline struct nouveau_heap *
 nv30_transfer_rect_vertprog(struct nv30_context *nv30)
 {
    struct nouveau_heap *heap = nv30->screen->vp_exec_heap;
@@ -108,7 +108,7 @@ nv30_transfer_rect_vertprog(struct nv30_context *nv30)
 }
 
 
-static INLINE struct nv04_resource *
+static inline struct nv04_resource *
 nv30_transfer_rect_fragprog(struct nv30_context *nv30)
 {
    struct nv04_resource *fp = nv04_resource(nv30->blit_fp);
@@ -368,29 +368,29 @@ nv30_transfer_rect_blit(XFER_ARGS)
    PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
 }
 
-static boolean
+static bool
 nv30_transfer_sifm(XFER_ARGS)
 {
    if (!src->pitch || (src->w | src->h) > 1024 || src->w < 2 || src->h < 2)
-      return FALSE;
+      return false;
 
    if (src->d > 1 || dst->d > 1)
-      return FALSE;
+      return false;
 
    if (dst->offset & 63)
-      return FALSE;
+      return false;
 
    if (!dst->pitch) {
       if ((dst->w | dst->h) > 2048 || dst->w < 2 || dst->h < 2)
-         return FALSE;
+         return false;
    } else {
       if (dst->domain != NOUVEAU_BO_VRAM)
-         return FALSE;
+         return false;
       if (dst->pitch & 63)
-         return FALSE;
+         return false;
    }
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -481,14 +481,14 @@ nv30_transfer_rect_sifm(XFER_ARGS)
  * that name is still accurate on nv4x) error.
  */
 
-static boolean
+static bool
 nv30_transfer_m2mf(XFER_ARGS)
 {
    if (!src->pitch || !dst->pitch)
-      return FALSE;
+      return false;
    if (nv30_transfer_scaled(src, dst))
-      return FALSE;
-   return TRUE;
+      return false;
+   return true;
 }
 
 static void
@@ -540,12 +540,12 @@ nv30_transfer_rect_m2mf(XFER_ARGS)
    }
 }
 
-static boolean
+static bool
 nv30_transfer_cpu(XFER_ARGS)
 {
    if (nv30_transfer_scaled(src, dst))
-      return FALSE;
-   return TRUE;
+      return false;
+   return true;
 }
 
 static char *
@@ -554,7 +554,7 @@ linear_ptr(struct nv30_rect *rect, char *base, int x, int y, int z)
    return base + (y * rect->pitch) + (x * rect->cpp);
 }
 
-static INLINE unsigned
+static inline unsigned
 swizzle2d(unsigned v, unsigned s)
 {
    v = (v | (v << 8)) & 0x00ff00ff;
@@ -614,7 +614,7 @@ swizzle3d_ptr(struct nv30_rect *rect, char *base, int x, int y, int z)
 
 typedef char *(*get_ptr_t)(struct nv30_rect *, char *, int, int, int);
 
-static INLINE get_ptr_t
+static inline get_ptr_t
 get_ptr(struct nv30_rect *rect)
 {
    if (rect->pitch)
@@ -653,7 +653,7 @@ nv30_transfer_rect(struct nv30_context *nv30, enum nv30_transfer_filter filter,
 {
    static const struct {
       char *name;
-      boolean (*possible)(XFER_ARGS);
+      bool (*possible)(XFER_ARGS);
       void (*execute)(XFER_ARGS);
    } *method, methods[] = {
       { "m2mf", nv30_transfer_m2mf, nv30_transfer_rect_m2mf },
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index d4e384b21d2..8494549e9b1 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -79,7 +79,7 @@ nv30_emit_vtxattr(struct nv30_context *nv30, struct pipe_vertex_buffer *vb,
    }
 }
 
-static INLINE void
+static inline void
 nv30_vbuf_range(struct nv30_context *nv30, int vbi,
                 uint32_t *base, uint32_t *size)
 {
@@ -119,7 +119,7 @@ nv30_prevalidate_vbufs(struct nv30_context *nv30)
             } else {
                nouveau_buffer_migrate(&nv30->base, buf, NOUVEAU_BO_GART);
             }
-            nv30->base.vbo_dirty = TRUE;
+            nv30->base.vbo_dirty = true;
          }
       }
    }
@@ -160,10 +160,10 @@ nv30_update_user_vbufs(struct nv30_context *nv30)
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD,
                        0, NV30_3D_VTXBUF_DMA1);
    }
-   nv30->base.vbo_dirty = TRUE;
+   nv30->base.vbo_dirty = true;
 }
 
-static INLINE void
+static inline void
 nv30_release_user_vbufs(struct nv30_context *nv30)
 {
    uint32_t vbo_user = nv30->vbo_user;
@@ -202,6 +202,9 @@ nv30_vbo_validate(struct nv30_context *nv30)
       return;
 
    redefine = MAX2(vertex->num_elements, nv30->state.num_vtxelts);
+   if (redefine == 0)
+      return;
+
    BEGIN_NV04(push, NV30_3D(VTXFMT(0)), redefine);
 
    for (i = 0; i < vertex->num_elements; i++) {
@@ -221,7 +224,7 @@ nv30_vbo_validate(struct nv30_context *nv30)
    for (i = 0; i < vertex->num_elements; i++) {
       struct nv04_resource *res;
       unsigned offset;
-      boolean user;
+      bool user;
 
       ve = &vertex->pipe[i];
       vb = &nv30->vtxbuf[ve->vertex_buffer_index];
@@ -254,14 +257,12 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements,
     struct translate_key transkey;
     unsigned i;
 
-    assert(num_elements);
-
     so = MALLOC(sizeof(*so) + sizeof(*so->element) * num_elements);
     if (!so)
         return NULL;
     memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
     so->num_elements = num_elements;
-    so->need_conversion = FALSE;
+    so->need_conversion = false;
 
     transkey.nr_elements = 0;
     transkey.output_stride = 0;
@@ -284,7 +285,7 @@ nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements,
                 return NULL;
             }
             so->element[i].state = nv30_vtxfmt(pipe->screen, fmt)->hw;
-            so->need_conversion = TRUE;
+            so->need_conversion = true;
         }
 
         if (1) {
@@ -452,7 +453,7 @@ nv30_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
 }
 
 static void
-nv30_draw_elements(struct nv30_context *nv30, boolean shorten,
+nv30_draw_elements(struct nv30_context *nv30, bool shorten,
                    unsigned mode, unsigned start, unsigned count,
                    unsigned instance_count, int32_t index_bias)
 {
@@ -461,13 +462,11 @@ nv30_draw_elements(struct nv30_context *nv30, boolean shorten,
    struct nouveau_object *eng3d = nv30->screen->eng3d;
    unsigned prim = nv30_prim_gl(mode);
 
-#if 0 /*XXX*/
-   if (index_bias != nv30->state.index_bias) {
-      BEGIN_NV04(push, NV30_3D(VB_ELEMENT_BASE), 1);
+   if (eng3d->oclass >= NV40_3D_CLASS && index_bias != nv30->state.index_bias) {
+      BEGIN_NV04(push, NV40_3D(VB_ELEMENT_BASE), 1);
       PUSH_DATA (push, index_bias);
       nv30->state.index_bias = index_bias;
    }
-#endif
 
    if (eng3d->oclass == NV40_3D_CLASS && index_size > 1 &&
        nv30->idxbuf.buffer) {
@@ -564,7 +563,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS)))
       nv30_update_user_vbufs(nv30);
 
-   nv30_state_validate(nv30, ~0, TRUE);
+   nv30_state_validate(nv30, ~0, true);
    if (nv30->draw_flags) {
       nv30_render_vbo(pipe, info);
       return;
@@ -578,17 +577,17 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (!nv30->vtxbuf[i].buffer)
          continue;
       if (nv30->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv30->base.vbo_dirty = TRUE;
+         nv30->base.vbo_dirty = true;
    }
 
    if (!nv30->base.vbo_dirty && nv30->idxbuf.buffer &&
        nv30->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-      nv30->base.vbo_dirty = TRUE;
+      nv30->base.vbo_dirty = true;
 
    if (nv30->base.vbo_dirty) {
       BEGIN_NV04(push, NV30_3D(VTX_CACHE_INVALIDATE_1710), 1);
       PUSH_DATA (push, 0);
-      nv30->base.vbo_dirty = FALSE;
+      nv30->base.vbo_dirty = false;
    }
 
    if (!info->indexed) {
@@ -596,7 +595,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
                        info->mode, info->start, info->count,
                        info->instance_count);
    } else {
-      boolean shorten = info->max_index <= 65535;
+      bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart != nv30->state.prim_restart) {
          if (info->primitive_restart) {
@@ -605,7 +604,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             PUSH_DATA (push, info->restart_index);
 
             if (info->restart_index > 65535)
-               shorten = FALSE;
+               shorten = false;
          } else {
             BEGIN_NV04(push, NV40_3D(PRIM_RESTART_ENABLE), 1);
             PUSH_DATA (push, 0);
@@ -617,7 +616,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
          PUSH_DATA (push, info->restart_index);
 
          if (info->restart_index > 65535)
-            shorten = FALSE;
+            shorten = false;
       }
 
       nv30_draw_elements(nv30, shorten,
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
index 4d4145d10b5..ee0a6280d7a 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
@@ -48,7 +48,7 @@ nv30_vertprog_destroy(struct nv30_vertprog *vp)
    vp->consts = NULL;
    vp->nr_consts = 0;
 
-   vp->translated = FALSE;
+   vp->translated = false;
 }
 
 void
@@ -58,8 +58,8 @@ nv30_vertprog_validate(struct nv30_context *nv30)
    struct nouveau_object *eng3d = nv30->screen->eng3d;
    struct nv30_vertprog *vp = nv30->vertprog.program;
    struct nv30_fragprog *fp = nv30->fragprog.program;
-   boolean upload_code = FALSE;
-   boolean upload_data = FALSE;
+   bool upload_code = false;
+   bool upload_data = false;
    unsigned i;
 
    if (nv30->dirty & NV30_NEW_FRAGPROG) {
@@ -125,7 +125,7 @@ nv30_vertprog_validate(struct nv30_context *nv30)
          }
       }
 
-      upload_code = TRUE;
+      upload_code = true;
    }
 
    if (vp->nr_consts && !vp->data) {
@@ -166,8 +166,8 @@ nv30_vertprog_validate(struct nv30_context *nv30)
          }
       }
 
-      upload_code = TRUE;
-      upload_data = TRUE;
+      upload_code = true;
+      upload_data = true;
    }
 
    if (vp->nr_consts) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
index 5cee5df60ce..2324b517c44 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
@@ -19,34 +19,34 @@
 #define NV40_3D_PRIM_RESTART_ENABLE 0x1dac
 #define NV40_3D_PRIM_RESTART_INDEX  0x1db0
 
-static INLINE void
+static inline void
 PUSH_RELOC(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t offset,
       uint32_t flags, uint32_t vor, uint32_t tor)
 {
    nouveau_pushbuf_reloc(push, bo, offset, flags, vor, tor);
 }
 
-static INLINE struct nouveau_bufctx *
+static inline struct nouveau_bufctx *
 bufctx(struct nouveau_pushbuf *push)
 {
    struct nouveau_bufctx **pctx = push->user_priv;
    return *pctx;
 }
 
-static INLINE void
+static inline void
 PUSH_RESET(struct nouveau_pushbuf *push, int bin)
 {
    nouveau_bufctx_reset(bufctx(push), bin);
 }
 
-static INLINE void
+static inline void
 PUSH_REFN(struct nouveau_pushbuf *push, int bin,
      struct nouveau_bo *bo, uint32_t access)
 {
    nouveau_bufctx_refn(bufctx(push), bin, bo, access);
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       struct nouveau_bo *bo, uint32_t offset, uint32_t access)
 {
@@ -55,7 +55,7 @@ PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
    PUSH_DATA(push, bo->offset + offset);
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       struct nouveau_bo *bo, uint32_t access, uint32_t vor, uint32_t tor)
 {
@@ -67,7 +67,7 @@ PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       PUSH_DATA(push, tor);
 }
 
-static INLINE void
+static inline void
 PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       struct nouveau_bo *bo, uint32_t data, uint32_t access,
       uint32_t vor, uint32_t tor)
@@ -80,7 +80,7 @@ PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
       PUSH_DATA(push, data | tor);
 }
 
-static INLINE struct nouveau_bufref *
+static inline struct nouveau_bufref *
 PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
      struct nouveau_bo *bo, uint32_t data, uint32_t access,
      uint32_t vor, uint32_t tor)
@@ -99,7 +99,7 @@ PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
    return bref;
 }
 
-static INLINE void
+static inline void
 PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
            struct nv04_resource *r, uint32_t data, uint32_t access,
            uint32_t vor, uint32_t tor)
@@ -108,14 +108,14 @@ PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
              r->domain | access, vor, tor)->priv = r;
 }
 
-static INLINE void
+static inline void
 BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
 {
    PUSH_SPACE(push, size + 1);
    PUSH_DATA (push, 0x00000000 | (size << 18) | (subc << 13) | mthd);
 }
 
-static INLINE void
+static inline void
 BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
 {
    PUSH_SPACE(push, size + 1);
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
index 9ef16965f39..e68d23e5587 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -44,7 +44,7 @@ struct nvfx_fpc {
    struct util_dynarray label_relocs;
 };
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 temp(struct nvfx_fpc *fpc)
 {
    int idx = __builtin_ctzll(~fpc->r_temps);
@@ -60,7 +60,7 @@ temp(struct nvfx_fpc *fpc)
    return nvfx_reg(NVFXSR_TEMP, idx);
 }
 
-static INLINE void
+static inline void
 release_temps(struct nvfx_fpc *fpc)
 {
    fpc->r_temps &= ~fpc->r_temps_discard;
@@ -373,7 +373,7 @@ nv40_fp_brk(struct nvfx_fpc *fpc)
    hw[3] = 0;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 {
    struct nvfx_src src;
@@ -415,7 +415,7 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
    return src;
 }
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
    switch (fdst->Register.File) {
    case TGSI_FILE_OUTPUT:
@@ -430,7 +430,7 @@ tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
    }
 }
 
-static INLINE int
+static inline int
 tgsi_mask(uint tgsi)
 {
    int mask = 0;
@@ -442,7 +442,7 @@ tgsi_mask(uint tgsi)
    return mask;
 }
 
-static boolean
+static bool
 nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
             const struct tgsi_full_instruction *finst)
 {
@@ -455,7 +455,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
    int i;
 
    if (finst->Instruction.Opcode == TGSI_OPCODE_END)
-      return TRUE;
+      return true;
 
    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
       const struct tgsi_full_src_register *fsrc;
@@ -525,7 +525,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
          break;
       default:
          NOUVEAU_ERR("bad src file\n");
-         return FALSE;
+         return false;
       }
    }
 
@@ -868,12 +868,12 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
 
         default:
       NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
-      return FALSE;
+      return false;
    }
 
 out:
    release_temps(fpc);
-   return TRUE;
+   return true;
 nv3x_cflow:
    {
       static int warned = 0;
@@ -887,7 +887,7 @@ nv3x_cflow:
    goto out;
 }
 
-static boolean
+static bool
 nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc,
                                const struct tgsi_full_declaration *fdec)
 {
@@ -917,17 +917,17 @@ nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc,
    case TGSI_SEMANTIC_GENERIC:
    case TGSI_SEMANTIC_PCOORD:
       /* will be assigned to remaining TC slots later */
-      return TRUE;
+      return true;
    default:
       assert(0);
-      return FALSE;
+      return false;
    }
 
    fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc,
                              const struct tgsi_full_declaration *fdec)
 {
@@ -954,16 +954,16 @@ nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc,
             }
             hw = NVFX_FP_OP_INPUT_SRC_TC(hw);
             fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
-            return TRUE;
+            return true;
          }
       }
-      return FALSE;
+      return false;
    default:
-      return TRUE;
+      return true;
    }
 }
 
-static boolean
+static bool
 nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc,
             const struct tgsi_full_declaration *fdec)
 {
@@ -984,20 +984,20 @@ nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc,
       }
       if(hw > ((fpc->is_nv4x) ? 4 : 2)) {
          NOUVEAU_ERR("bad rcol index\n");
-         return FALSE;
+         return false;
       }
       break;
    default:
       NOUVEAU_ERR("bad output semantic\n");
-      return FALSE;
+      return false;
    }
 
    fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
    fpc->r_temps |= (1ULL << hw);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_fragprog_prepare(struct nvfx_fpc *fpc)
 {
    struct tgsi_parse_context p;
@@ -1081,17 +1081,17 @@ nvfx_fragprog_prepare(struct nvfx_fpc *fpc)
       fpc->r_temps_discard = 0ULL;
    }
 
-   return TRUE;
+   return true;
 
 out_err:
    FREE(fpc->r_temp);
    fpc->r_temp = NULL;
 
    tgsi_parse_free(&p);
-   return FALSE;
+   return false;
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", false)
 
 void
 _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
@@ -1100,7 +1100,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
    struct nvfx_fpc *fpc = NULL;
    struct util_dynarray insns;
 
-   fp->translated = FALSE;
+   fp->translated = false;
    fp->point_sprite_control = 0;
    fp->vp_or = 0;
 
@@ -1182,7 +1182,7 @@ _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
       debug_printf("\n");
    }
 
-   fp->translated = TRUE;
+   fp->translated = true;
 
 out:
    tgsi_parse_free(&parse);
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
index 9538a793d7e..e66d8af7620 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
@@ -448,8 +448,8 @@ struct nvfx_insn
 	struct nvfx_src src[3];
 };
 
-static INLINE struct nvfx_insn
-nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
+static inline struct nvfx_insn
+nvfx_insn(bool sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
 {
 	struct nvfx_insn insn = {
 		.op = op,
@@ -468,7 +468,7 @@ nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask
 	return insn;
 }
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 nvfx_reg(int type, int index)
 {
 	struct nvfx_reg temp = {
@@ -478,7 +478,7 @@ nvfx_reg(int type, int index)
 	return temp;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src(struct nvfx_reg reg)
 {
 	struct nvfx_src temp = {
@@ -491,7 +491,7 @@ nvfx_src(struct nvfx_reg reg)
 	return temp;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w)
 {
 	struct nvfx_src dst = src;
@@ -503,14 +503,14 @@ nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w)
 	return dst;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src_neg(struct nvfx_src src)
 {
 	src.negate = !src.negate;
 	return src;
 }
 
-static INLINE struct nvfx_src
+static inline struct nvfx_src
 nvfx_src_abs(struct nvfx_src src)
 {
 	src.abs = 1;
@@ -529,7 +529,7 @@ struct nv30_vertprog;
 void
 _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp);
 
-boolean
+bool
 _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp);
 
 #endif
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index 1ce0589be71..5757eb1fb16 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -416,7 +416,7 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
    return src;
 }
 
-static INLINE struct nvfx_reg
+static inline struct nvfx_reg
 tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
    struct nvfx_reg dst;
 
@@ -455,7 +455,7 @@ tgsi_mask(uint tgsi)
    return mask;
 }
 
-static boolean
+static bool
 nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
             unsigned idx, const struct tgsi_full_instruction *finst)
 {
@@ -466,7 +466,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
    struct nvfx_insn insn;
    struct nvfx_relocation reloc;
    struct nvfx_loop_entry loop;
-   boolean sat = FALSE;
+   bool sat = false;
    int mask;
    int ai = -1, ci = -1, ii = -1;
    int i;
@@ -524,25 +524,25 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
          break;
       default:
          NOUVEAU_ERR("bad src file\n");
-         return FALSE;
+         return false;
       }
    }
 
    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
       if(src[i].reg.type < 0)
-         return FALSE;
+         return false;
    }
 
    if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS &&
       finst->Instruction.Opcode != TGSI_OPCODE_ARL)
-      return FALSE;
+      return false;
 
    final_dst = dst  = tgsi_dst(vpc, &finst->Dst[0]);
    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
    if(finst->Instruction.Saturate) {
       assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL);
       if (vpc->is_nv4x)
-         sat = TRUE;
+         sat = true;
       else
       if(dst.type != NVFXSR_TEMP)
          dst = temp(vpc);
@@ -793,7 +793,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
       break;
    default:
       NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
-      return FALSE;
+      return false;
    }
 
    if(finst->Instruction.Saturate && !vpc->is_nv4x) {
@@ -804,10 +804,10 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
    }
 
    release_temps(vpc);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
                                 const struct tgsi_full_declaration *fdec)
 {
@@ -825,7 +825,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
       vpc->r_result[idx] = temp(vpc);
       vpc->r_temps_discard = 0;
       vpc->cvtx_idx = idx;
-      return TRUE;
+      return true;
    case TGSI_SEMANTIC_COLOR:
       if (fdec->Semantic.Index == 0) {
          hw = NVFX_VP(INST_DEST_COL0);
@@ -834,7 +834,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
          hw = NVFX_VP(INST_DEST_COL1);
       } else {
          NOUVEAU_ERR("bad colour semantic index\n");
-         return FALSE;
+         return false;
       }
       break;
    case TGSI_SEMANTIC_BCOLOR:
@@ -845,7 +845,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
          hw = NVFX_VP(INST_DEST_BFC1);
       } else {
          NOUVEAU_ERR("bad bcolour semantic index\n");
-         return FALSE;
+         return false;
       }
       break;
    case TGSI_SEMANTIC_FOG:
@@ -868,22 +868,22 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
 
       if (i == num_texcoords) {
          vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0);
-         return TRUE;
+         return true;
       }
       break;
    case TGSI_SEMANTIC_EDGEFLAG:
       vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0);
-      return TRUE;
+      return true;
    default:
       NOUVEAU_ERR("bad output semantic\n");
-      return FALSE;
+      return false;
    }
 
    vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
-   return TRUE;
+   return true;
 }
 
-static boolean
+static bool
 nvfx_vertprog_prepare(struct nvfx_vpc *vpc)
 {
    struct tgsi_parse_context p;
@@ -924,7 +924,7 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc)
             break;
          case TGSI_FILE_OUTPUT:
             if (!nvfx_vertprog_parse_decl_output(vpc, fdec))
-               return FALSE;
+               return false;
             break;
          default:
             break;
@@ -961,12 +961,12 @@ nvfx_vertprog_prepare(struct nvfx_vpc *vpc)
    }
 
    vpc->r_temps_discard = 0;
-   return TRUE;
+   return true;
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", false)
 
-boolean
+bool
 _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
 {
    struct tgsi_parse_context parse;
@@ -975,13 +975,13 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
    struct util_dynarray insns;
    int i, ucps;
 
-   vp->translated = FALSE;
+   vp->translated = false;
    vp->nr_insns = 0;
    vp->nr_consts = 0;
 
    vpc = CALLOC_STRUCT(nvfx_vpc);
    if (!vpc)
-      return FALSE;
+      return false;
    vpc->is_nv4x = (oclass >= NV40_3D_CLASS) ? ~0 : 0;
    vpc->vp   = vp;
    vpc->pipe = vp->pipe;
@@ -990,7 +990,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
 
    if (!nvfx_vertprog_prepare(vpc)) {
       FREE(vpc);
-      return FALSE;
+      return false;
    }
 
    /* Redirect post-transform vertex position to a temp if user clip
@@ -1108,7 +1108,7 @@ _nvfx_vertprog_translate(uint16_t oclass, struct nv30_vertprog *vp)
       debug_printf("\n");
    }
 
-   vp->translated = TRUE;
+   vp->translated = true;
 
 out:
    tgsi_parse_free(&parse);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_blit.h b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
index 756c4c11bf6..0ccec568d3a 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_blit.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
@@ -37,7 +37,7 @@ nv50_resource_resolve(struct pipe_context *, const struct pipe_resolve_info *);
 #define NV50_BLIT_TEXTURE_2D_ARRAY  5
 #define NV50_BLIT_MAX_TEXTURE_TYPES 6
 
-static INLINE unsigned
+static inline unsigned
 nv50_blit_texture_type(enum pipe_texture_target target)
 {
    switch (target) {
@@ -52,7 +52,7 @@ nv50_blit_texture_type(enum pipe_texture_target target)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target)
 {
    switch (target) {
@@ -67,7 +67,7 @@ nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target)
    }
 }
 
-static INLINE enum pipe_texture_target
+static inline enum pipe_texture_target
 nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target)
 {
    switch (target) {
@@ -81,7 +81,7 @@ nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 nv50_blit_get_filter(const struct pipe_blit_info *info)
 {
    if (info->dst.resource->nr_samples < info->src.resource->nr_samples)
@@ -102,7 +102,7 @@ nv50_blit_get_filter(const struct pipe_blit_info *info)
 /* Since shaders cannot export stencil, we cannot copy stencil values when
  * rendering to ZETA, so we attach the ZS surface to a colour render target.
  */
-static INLINE enum pipe_format
+static inline enum pipe_format
 nv50_blit_zeta_to_colour_format(enum pipe_format format)
 {
    switch (format) {
@@ -127,7 +127,7 @@ nv50_blit_zeta_to_colour_format(enum pipe_format format)
 }
 
 
-static INLINE uint16_t
+static inline uint16_t
 nv50_blit_derive_color_mask(const struct pipe_blit_info *info)
 {
    const unsigned mask = info->mask;
@@ -162,7 +162,7 @@ nv50_blit_derive_color_mask(const struct pipe_blit_info *info)
    return color_mask;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info)
 {
    uint32_t mask = 0;
@@ -191,8 +191,8 @@ nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info)
 # define nv50_format_table nvc0_format_table
 #endif
 
-/* return TRUE for formats that can be converted among each other by NVC0_2D */
-static INLINE boolean
+/* return true for formats that can be converted among each other by NVC0_2D */
+static inline bool
 nv50_2d_dst_format_faithful(enum pipe_format format)
 {
    const uint64_t mask =
@@ -201,7 +201,7 @@ nv50_2d_dst_format_faithful(enum pipe_format format)
    uint8_t id = nv50_format_table[format].rt;
    return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
 }
-static INLINE boolean
+static inline bool
 nv50_2d_src_format_faithful(enum pipe_format format)
 {
    const uint64_t mask =
@@ -211,7 +211,7 @@ nv50_2d_src_format_faithful(enum pipe_format format)
    return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
 }
 
-static INLINE boolean
+static inline bool
 nv50_2d_format_supported(enum pipe_format format)
 {
    uint8_t id = nv50_format_table[format].rt;
@@ -219,7 +219,7 @@ nv50_2d_format_supported(enum pipe_format format)
       (NV50_ENG2D_SUPPORTED_FORMATS & (1ULL << (id - 0xc0)));
 }
 
-static INLINE boolean
+static inline bool
 nv50_2d_dst_format_ops_supported(enum pipe_format format)
 {
    uint8_t id = nv50_format_table[format].rt;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 5b5d3912c20..f8d46db7c67 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -64,12 +64,12 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags)
          if (!nv50->vtxbuf[i].buffer)
             continue;
          if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-            nv50->base.vbo_dirty = TRUE;
+            nv50->base.vbo_dirty = true;
       }
 
       if (nv50->idxbuf.buffer &&
           nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nv50->base.vbo_dirty = TRUE;
+         nv50->base.vbo_dirty = true;
 
       for (s = 0; s < 3 && !nv50->cb_dirty; ++s) {
          uint32_t valid = nv50->constbuf_valid[s];
@@ -87,7 +87,7 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags)
                continue;
 
             if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-               nv50->cb_dirty = TRUE;
+               nv50->cb_dirty = true;
          }
       }
    }
@@ -100,9 +100,9 @@ nv50_default_kick_notify(struct nouveau_pushbuf *push)
 
    if (screen) {
       nouveau_fence_next(&screen->base);
-      nouveau_fence_update(&screen->base, TRUE);
+      nouveau_fence_update(&screen->base, true);
       if (screen->cur_ctx)
-         screen->cur_ctx->state.flushed = TRUE;
+         screen->cur_ctx->state.flushed = true;
    }
 }
 
@@ -310,7 +310,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv)
    nv50->base.invalidate_resource_storage = nv50_invalidate_resource_storage;
 
    if (screen->base.device->chipset < 0x84 ||
-       debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) {
+       debug_get_bool_option("NOUVEAU_PMPEG", false)) {
       /* PMPEG */
       nouveau_context_init_vdec(&nv50->base);
    } else if (screen->base.device->chipset < 0x98 ||
@@ -351,7 +351,7 @@ out_err:
 }
 
 void
-nv50_bufctx_fence(struct nouveau_bufctx *bufctx, boolean on_flush)
+nv50_bufctx_fence(struct nouveau_bufctx *bufctx, bool on_flush)
 {
    struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending;
    struct nouveau_list *it;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 1f123ef7e92..ce12e714774 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -91,7 +91,7 @@
 
 struct nv50_blitctx;
 
-boolean nv50_blitctx_create(struct nv50_context *);
+bool nv50_blitctx_create(struct nv50_context *);
 
 struct nv50_context {
    struct nouveau_context base;
@@ -102,7 +102,7 @@ struct nv50_context {
    struct nouveau_bufctx *bufctx;
 
    uint32_t dirty;
-   boolean cb_dirty;
+   bool cb_dirty;
 
    struct nv50_graph_state state;
 
@@ -152,26 +152,26 @@ struct nv50_context {
    unsigned sample_mask;
    unsigned min_samples;
 
-   boolean vbo_push_hint;
+   bool vbo_push_hint;
 
    uint32_t rt_array_mode;
 
    struct pipe_query *cond_query;
-   boolean cond_cond; /* inverted rendering condition */
+   bool cond_cond; /* inverted rendering condition */
    uint cond_mode;
    uint32_t cond_condmode; /* the calculated condition */
 
    struct nv50_blitctx *blit;
 };
 
-static INLINE struct nv50_context *
+static inline struct nv50_context *
 nv50_context(struct pipe_context *pipe)
 {
    return (struct nv50_context *)pipe;
 }
 
 /* return index used in nv50_context arrays for a specific shader type */
-static INLINE unsigned
+static inline unsigned
 nv50_context_shader_stage(unsigned pipe)
 {
    switch (pipe) {
@@ -188,7 +188,7 @@ nv50_context_shader_stage(unsigned pipe)
 /* nv50_context.c */
 struct pipe_context *nv50_create(struct pipe_screen *, void *);
 
-void nv50_bufctx_fence(struct nouveau_bufctx *, boolean on_flush);
+void nv50_bufctx_fence(struct nouveau_bufctx *, bool on_flush);
 
 void nv50_default_kick_notify(struct nouveau_pushbuf *);
 
@@ -202,7 +202,7 @@ void nv50_query_pushbuf_submit(struct nouveau_pushbuf *,
 void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
 void nva0_so_target_save_offset(struct pipe_context *,
                                 struct pipe_stream_output_target *,
-                                unsigned index, boolean seralize);
+                                unsigned index, bool seralize);
 
 #define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
@@ -221,8 +221,8 @@ extern void nv50_init_state_functions(struct nv50_context *);
 
 /* nv50_state_validate.c */
 /* @words: check for space before emitting relocs */
-extern boolean nv50_state_validate(struct nv50_context *, uint32_t state_mask,
-                                   unsigned space_words);
+extern bool nv50_state_validate(struct nv50_context *, uint32_t state_mask,
+                                unsigned space_words);
 
 /* nv50_surface.c */
 extern void nv50_clear(struct pipe_context *, unsigned buffers,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
index 0f86ba1de0d..49a93bf1d91 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -44,7 +44,7 @@
  */
 #define U_V   PIPE_BIND_VERTEX_BUFFER
 #define U_T   PIPE_BIND_SAMPLER_VIEW
-#define U_I   PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE
+#define U_I   PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE | PIPE_BIND_COMPUTE_RESOURCE
 #define U_TR  PIPE_BIND_RENDER_TARGET | U_T
 #define U_IR  U_TR | U_I
 #define U_TB  PIPE_BIND_BLENDABLE | U_TR
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
index f15d8f3ecb6..92d49e49ff2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -30,7 +30,7 @@
 
 uint32_t
 nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz,
-                                 boolean is_3d)
+                                 bool is_3d)
 {
    uint32_t tile_mode = 0x000;
 
@@ -59,13 +59,13 @@ nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz,
 }
 
 static uint32_t
-nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d)
+nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d)
 {
    return nv50_tex_choose_tile_dims_helper(nx, ny * 2, nz, is_3d);
 }
 
 static uint32_t
-nv50_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
+nv50_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed)
 {
    const unsigned ms = util_logbase2(mt->base.base.nr_samples);
    uint32_t tile_flags;
@@ -184,7 +184,7 @@ nv50_miptree_get_handle(struct pipe_screen *pscreen,
    unsigned stride;
 
    if (!mt || !mt->base.bo)
-      return FALSE;
+      return false;
 
    stride = mt->level[0].pitch;
 
@@ -204,7 +204,7 @@ const struct u_resource_vtbl nv50_miptree_vtbl =
    u_default_transfer_inline_write  /* transfer_inline_write */
 };
 
-static INLINE boolean
+static inline bool
 nv50_miptree_init_ms_mode(struct nv50_miptree *mt)
 {
    switch (mt->base.base.nr_samples) {
@@ -228,12 +228,12 @@ nv50_miptree_init_ms_mode(struct nv50_miptree *mt)
       break;
    default:
       NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples);
-      return FALSE;
+      return false;
    }
-   return TRUE;
+   return true;
 }
 
-boolean
+bool
 nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align)
 {
    struct pipe_resource *pt = &mt->base.base;
@@ -241,12 +241,12 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align)
    unsigned h = pt->height0;
 
    if (util_format_is_depth_or_stencil(pt->format))
-      return FALSE;
+      return false;
 
    if ((pt->last_level > 0) || (pt->depth0 > 1) || (pt->array_size > 1))
-      return FALSE;
+      return false;
    if (mt->ms_x | mt->ms_y)
-      return FALSE;
+      return false;
 
    mt->level[0].pitch = align(pt->width0 * blocksize, pitch_align);
 
@@ -256,7 +256,7 @@ nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align)
 
    mt->total_size = mt->level[0].pitch * h;
 
-   return TRUE;
+   return true;
 }
 
 static void
@@ -335,7 +335,7 @@ nv50_miptree_create(struct pipe_screen *pscreen,
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   boolean compressed = dev->drm_version >= 0x01000101;
+   bool compressed = dev->drm_version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
@@ -438,7 +438,7 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen,
 
 
 /* Offset of zslice @z from start of level @l. */
-INLINE unsigned
+inline unsigned
 nv50_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z)
 {
    const struct pipe_resource *pt = &mt->base.base;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index aaca4c550d9..02dc3677259 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -25,7 +25,7 @@
 
 #include "codegen/nv50_ir_driver.h"
 
-static INLINE unsigned
+static inline unsigned
 bitcount4(const uint32_t val)
 {
    static const uint8_t cnt[16]
@@ -104,7 +104,7 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
          prog->vp.bfc[info->out[i].si] = i;
          break;
       case TGSI_SEMANTIC_LAYER:
-         prog->gp.has_layer = TRUE;
+         prog->gp.has_layer = true;
          prog->gp.layerid = n;
          break;
       case TGSI_SEMANTIC_VIEWPORT_INDEX:
@@ -316,7 +316,7 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
    return so;
 }
 
-boolean
+bool
 nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 {
    struct nv50_ir_prog_info *info;
@@ -325,7 +325,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 
    info = CALLOC_STRUCT(nv50_ir_prog_info);
    if (!info)
-      return FALSE;
+      return false;
 
    info->type = prog->type;
    info->target = chipset;
@@ -410,7 +410,7 @@ out:
    return !ret;
 }
 
-boolean
+bool
 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
 {
    struct nouveau_heap *heap;
@@ -423,7 +423,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
    default:
       assert(!"invalid program type");
-      return FALSE;
+      return false;
    }
 
    ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
@@ -440,7 +440,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
       if (ret) {
          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
-         return FALSE;
+         return false;
       }
    }
    prog->code_base = prog->mem->start;
@@ -448,10 +448,10 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
    if (ret < 0) {
       nouveau_heap_free(&prog->mem);
-      return FALSE;
+      return false;
    }
    if (ret > 0)
-      nv50->state.new_tls_space = TRUE;
+      nv50->state.new_tls_space = true;
 
    if (prog->fixups)
       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
@@ -463,7 +463,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
    PUSH_DATA (nv50->base.pushbuf, 0);
 
-   return TRUE;
+   return true;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index fe6bd6025be..5d3ff5644d2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -53,7 +53,7 @@ struct nv50_program {
    struct pipe_shader_state pipe;
 
    ubyte type;
-   boolean translated;
+   bool translated;
 
    uint32_t *code;
    unsigned code_size;
@@ -104,8 +104,8 @@ struct nv50_program {
    struct nv50_stream_output_state *so;
 };
 
-boolean nv50_program_translate(struct nv50_program *, uint16_t chipset);
-boolean nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
+bool nv50_program_translate(struct nv50_program *, uint16_t chipset);
+bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
 void nv50_program_destroy(struct nv50_context *, struct nv50_program *);
 
 #endif /* __NV50_PROG_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c
index a3a397c52c1..f31eaa0e314 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_push.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c
@@ -23,13 +23,13 @@ struct push_context {
 
    struct translate *translate;
 
-   boolean primitive_restart;
+   bool primitive_restart;
    uint32_t prim;
    uint32_t restart_index;
    uint32_t instance_id;
 };
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
 {
    unsigned i;
@@ -39,7 +39,7 @@ prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
 {
    unsigned i;
@@ -49,7 +49,7 @@ prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index)
 {
    unsigned i;
@@ -179,7 +179,7 @@ emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 #define NV50_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nv50_prim_gl(unsigned prim)
 {
    switch (prim) {
@@ -212,7 +212,7 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
    unsigned i, index_size;
    unsigned inst_count = info->instance_count;
    unsigned vert_count = info->count;
-   boolean apply_bias = info->indexed && info->index_bias;
+   bool apply_bias = info->indexed && info->index_bias;
 
    ctx.push = nv50->base.pushbuf;
    ctx.translate = nv50->vertex->translate;
@@ -258,12 +258,12 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
             NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
             return;
          }
-         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count);
          vert_count /= targ->stride;
       }
       ctx.idxbuf = NULL;
       index_size = 0;
-      ctx.primitive_restart = FALSE;
+      ctx.primitive_restart = false;
       ctx.restart_index = 0;
    }
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index 81f7474e36b..f4adbf8c653 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -48,20 +48,21 @@ struct nv50_query {
    uint32_t base;
    uint32_t offset; /* base + i * 32 */
    uint8_t state;
-   boolean is64bit;
+   bool is64bit;
+   int nesting; /* only used for occlusion queries */
    struct nouveau_mm_allocation *mm;
    struct nouveau_fence *fence;
 };
 
 #define NV50_QUERY_ALLOC_SPACE 256
 
-static INLINE struct nv50_query *
+static inline struct nv50_query *
 nv50_query(struct pipe_query *pipe)
 {
    return (struct nv50_query *)pipe;
 }
 
-static boolean
+static bool
 nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
 {
    struct nv50_screen *screen = nv50->screen;
@@ -80,17 +81,17 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
    if (size) {
       q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
       if (!q->bo)
-         return FALSE;
+         return false;
       q->offset = q->base;
 
       ret = nouveau_bo_map(q->bo, 0, screen->base.client);
       if (ret) {
          nv50_query_allocate(nv50, q, 0);
-         return FALSE;
+         return false;
       }
       q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
    }
-   return TRUE;
+   return true;
 }
 
 static void
@@ -153,8 +154,8 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
    struct nv50_query *q = nv50_query(pq);
 
    /* For occlusion queries we have to change the storage, because a previous
-    * query might set the initial render conition to FALSE even *after* we re-
-    * initialized it to TRUE.
+    * query might set the initial render condition to false even *after* we re-
+    * initialized it to true.
     */
    if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
       q->offset += 32;
@@ -166,7 +167,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
        *  query ?
        */
       q->data[0] = q->sequence; /* initialize sequence */
-      q->data[1] = 1; /* initial render condition = TRUE */
+      q->data[1] = 1; /* initial render condition = true */
       q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
       q->data[5] = 0;
    }
@@ -175,11 +176,16 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
-      PUSH_SPACE(push, 4);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
-      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
-      PUSH_DATA (push, 1);
+      q->nesting = nv50->screen->num_occlusion_queries_active++;
+      if (q->nesting) {
+         nv50_query_get(push, q, 0x10, 0x0100f002);
+      } else {
+         PUSH_SPACE(push, 4);
+         BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
+         PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
+         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+         PUSH_DATA (push, 1);
+      }
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
       nv50_query_get(push, q, 0x10, 0x06805002);
@@ -223,9 +229,11 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
       nv50_query_get(push, q, 0, 0x0100f002);
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
-      PUSH_DATA (push, 0);
+      if (--nv50->screen->num_occlusion_queries_active == 0) {
+         PUSH_SPACE(push, 2);
+         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+         PUSH_DATA (push, 0);
+      }
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
       nv50_query_get(push, q, 0, 0x06805002);
@@ -261,7 +269,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      /* This query is not issued on GPU because disjoint is forced to FALSE */
+      /* This query is not issued on GPU because disjoint is forced to false */
       q->state = NV50_QUERY_STATE_READY;
       break;
    default:
@@ -273,7 +281,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence);
 }
 
-static INLINE void
+static inline void
 nv50_query_update(struct nv50_query *q)
 {
    if (q->is64bit) {
@@ -293,7 +301,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nv50_query *q = nv50_query(pq);
    uint64_t *res64 = (uint64_t *)result;
    uint32_t *res32 = (uint32_t *)result;
-   boolean *res8 = (boolean *)result;
+   uint8_t *res8 = (uint8_t *)result;
    uint64_t *data64 = (uint64_t *)q->data;
    int i;
 
@@ -307,19 +315,19 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
             q->state = NV50_QUERY_STATE_FLUSHED;
             PUSH_KICK(nv50->base.pushbuf);
          }
-         return FALSE;
+         return false;
       }
       if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
-         return FALSE;
+         return false;
    }
    q->state = NV50_QUERY_STATE_READY;
 
    switch (q->type) {
    case PIPE_QUERY_GPU_FINISHED:
-      res8[0] = TRUE;
+      res8[0] = true;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
-      res64[0] = q->data[1];
+      res64[0] = q->data[1] - q->data[5];
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
    case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
@@ -338,7 +346,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
       res64[0] = 1000000000;
-      res8[8] = FALSE;
+      res8[8] = false;
       break;
    case PIPE_QUERY_TIME_ELAPSED:
       res64[0] = data64[1] - data64[3];
@@ -347,10 +355,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       res32[0] = q->data[1];
       break;
    default:
-      return FALSE;
+      return false;
    }
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -377,7 +385,7 @@ nv50_render_condition(struct pipe_context *pipe,
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_query *q;
    uint32_t cond;
-   boolean wait =
+   bool wait =
       mode != PIPE_RENDER_COND_NO_WAIT &&
       mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;
 
@@ -391,13 +399,12 @@ nv50_render_condition(struct pipe_context *pipe,
       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
          cond = condition ? NV50_3D_COND_MODE_EQUAL :
                             NV50_3D_COND_MODE_NOT_EQUAL;
-         wait = TRUE;
+         wait = true;
          break;
       case PIPE_QUERY_OCCLUSION_COUNTER:
       case PIPE_QUERY_OCCLUSION_PREDICATE:
          if (likely(!condition)) {
-            /* XXX: Placeholder, handle nesting here if available */
-            if (unlikely(false))
+            if (unlikely(q->nesting))
                cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL :
                              NV50_3D_COND_MODE_ALWAYS;
             else
@@ -461,7 +468,7 @@ nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
 void
 nva0_so_target_save_offset(struct pipe_context *pipe,
                            struct pipe_stream_output_target *ptarg,
-                           unsigned index, boolean serialize)
+                           unsigned index, bool serialize)
 {
    struct nv50_so_target *targ = nv50_so_target(ptarg);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
index f7ee1354a92..a46e622c597 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -35,7 +35,7 @@ nv50_screen_init_resource_functions(struct pipe_screen *pscreen);
 
 uint32_t
 nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz,
-                                 boolean is_3d);
+                                 bool is_3d);
 
 struct nv50_miptree_level {
    uint32_t offset;
@@ -50,13 +50,13 @@ struct nv50_miptree {
    struct nv50_miptree_level level[NV50_MAX_TEXTURE_LEVELS];
    uint32_t total_size;
    uint32_t layer_stride;
-   boolean layout_3d; /* TRUE if layer count varies with mip level */
+   bool layout_3d; /* true if layer count varies with mip level */
    uint8_t ms_x;      /* log2 of number of samples in x/y dimension */
    uint8_t ms_y;
    uint8_t ms_mode;
 };
 
-static INLINE struct nv50_miptree *
+static inline struct nv50_miptree *
 nv50_miptree(struct pipe_resource *pt)
 {
    return (struct nv50_miptree *)pt;
@@ -70,7 +70,7 @@ nv50_miptree(struct pipe_resource *pt)
 
 /* Internal functions:
  */
-boolean
+bool
 nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align);
 
 struct pipe_resource *
@@ -98,13 +98,13 @@ struct nv50_surface {
    uint16_t depth;
 };
 
-static INLINE struct nv50_surface *
+static inline struct nv50_surface *
 nv50_surface(struct pipe_surface *ps)
 {
    return (struct nv50_surface *)ps;
 }
 
-static INLINE enum pipe_format
+static inline enum pipe_format
 nv50_zs_to_s_format(enum pipe_format format)
 {
    switch (format) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 6583a353578..30e6e042fbf 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -51,19 +51,19 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
                                 unsigned bindings)
 {
    if (sample_count > 8)
-      return FALSE;
+      return false;
    if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */
-      return FALSE;
+      return false;
    if (sample_count == 8 && util_format_get_blocksizebits(format) >= 128)
-      return FALSE;
+      return false;
 
    if (!util_format_is_supported(format, bindings))
-      return FALSE;
+      return false;
 
    switch (format) {
    case PIPE_FORMAT_Z16_UNORM:
       if (nv50_screen(pscreen)->tesla->oclass < NVA0_3D_CLASS)
-         return FALSE;
+         return false;
       break;
    default:
       break;
@@ -176,6 +176,9 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CLIP_HALFZ:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -210,6 +213,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -286,7 +290,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       /* The chip could handle more sampler views than samplers */
    case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
-      return MIN2(32, PIPE_MAX_SAMPLERS);
+      return MIN2(16, PIPE_MAX_SAMPLERS);
    case PIPE_SHADER_CAP_DOUBLES:
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
@@ -454,7 +458,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
    BEGIN_NV04(push, NV50_3D(UNK1400_LANES), 1);
    PUSH_DATA (push, 0xf);
 
-   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) {
+   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) {
       BEGIN_NV04(push, NV50_3D(WATCHDOG_TIMER), 1);
       PUSH_DATA (push, 0x18);
    }
@@ -734,7 +738,7 @@ nv50_screen_create(struct nouveau_device *dev)
    nv50_screen_init_resource_functions(pscreen);
 
    if (screen->base.device->chipset < 0x84 ||
-       debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) {
+       debug_get_bool_option("NOUVEAU_PMPEG", false)) {
       /* PMPEG */
       nouveau_screen_init_vdec(&screen->base);
    } else if (screen->base.device->chipset < 0x98 ||
@@ -890,7 +894,7 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nv50_screen_init_hwctx(screen);
 
-   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return pscreen;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index 881051b1862..ce51f0fc254 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -32,14 +32,14 @@ struct nv50_graph_state {
    uint32_t semantic_color;
    uint32_t semantic_psize;
    int32_t index_bias;
-   boolean uniform_buffer_bound[3];
-   boolean prim_restart;
-   boolean point_sprite;
-   boolean rt_serialize;
-   boolean flushed;
-   boolean rasterizer_discard;
+   bool uniform_buffer_bound[3];
+   bool prim_restart;
+   bool point_sprite;
+   bool rt_serialize;
+   bool flushed;
+   bool rasterizer_discard;
    uint8_t tls_required;
-   boolean new_tls_space;
+   bool new_tls_space;
    uint8_t num_vtxbufs;
    uint8_t num_vtxelts;
    uint8_t num_textures[3];
@@ -54,6 +54,8 @@ struct nv50_screen {
    struct nv50_context *cur_ctx;
    struct nv50_graph_state save_state;
 
+   int num_occlusion_queries_active;
+
    struct nouveau_bo *code;
    struct nouveau_bo *uniforms;
    struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
@@ -95,19 +97,19 @@ struct nv50_screen {
    struct nouveau_object *m2mf;
 };
 
-static INLINE struct nv50_screen *
+static inline struct nv50_screen *
 nv50_screen(struct pipe_screen *screen)
 {
    return (struct nv50_screen *)screen;
 }
 
-boolean nv50_blitter_create(struct nv50_screen *);
+bool nv50_blitter_create(struct nv50_screen *);
 void nv50_blitter_destroy(struct nv50_screen *);
 
 int nv50_screen_tic_alloc(struct nv50_screen *, void *);
 int nv50_screen_tsc_alloc(struct nv50_screen *, void *);
 
-static INLINE void
+static inline void
 nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
    struct nv50_screen *screen = nv50_screen(res->base.screen);
@@ -119,7 +121,7 @@ nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
    }
 }
 
-static INLINE void
+static inline void
 nv50_resource_validate(struct nv04_resource *res, uint32_t flags)
 {
    if (likely(res->bo)) {
@@ -142,21 +144,21 @@ struct nv50_format {
 
 extern const struct nv50_format nv50_format_table[];
 
-static INLINE void
+static inline void
 nv50_screen_tic_unlock(struct nv50_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0)
       screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
 }
 
-static INLINE void
+static inline void
 nv50_screen_tsc_unlock(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0)
       screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
 }
 
-static INLINE void
+static inline void
 nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0) {
@@ -165,7 +167,7 @@ nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic)
    }
 }
 
-static INLINE void
+static inline void
 nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index c698782d8bd..b033ce5c6dc 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -60,7 +60,7 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                continue;
             }
             if (!nv50->state.uniform_buffer_bound[s]) {
-               nv50->state.uniform_buffer_bound[s] = TRUE;
+               nv50->state.uniform_buffer_bound[s] = true;
                BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
                PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
             }
@@ -99,33 +99,35 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
 
                BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD);
+
+               nv50->cb_dirty = 1; /* Force cache flush for UBO. */
             } else {
                BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
                PUSH_DATA (push, (i << 8) | p | 0);
             }
             if (i == 0)
-               nv50->state.uniform_buffer_bound[s] = FALSE;
+               nv50->state.uniform_buffer_bound[s] = false;
          }
       }
    }
 }
 
-static boolean
+static bool
 nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
 {
    if (!prog->translated) {
       prog->translated = nv50_program_translate(
          prog, nv50->screen->base.device->chipset);
       if (!prog->translated)
-         return FALSE;
+         return false;
    } else
    if (prog->mem)
-      return TRUE;
+      return true;
 
    return nv50_program_upload_code(nv50, prog);
 }
 
-static INLINE void
+static inline void
 nv50_program_update_context_state(struct nv50_context *nv50,
                                   struct nv50_program *prog, int stage)
 {
@@ -136,7 +138,7 @@ nv50_program_update_context_state(struct nv50_context *nv50,
          nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
       if (!nv50->state.tls_required || nv50->state.new_tls_space)
          BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
-      nv50->state.new_tls_space = FALSE;
+      nv50->state.new_tls_space = false;
       nv50->state.tls_required |= 1 << stage;
    } else {
       if (nv50->state.tls_required == (1 << stage))
@@ -243,11 +245,11 @@ nv50_sprite_coords_validate(struct nv50_context *nv50)
          for (i = 0; i < 8; ++i)
             PUSH_DATA(push, 0);
 
-         nv50->state.point_sprite = FALSE;
+         nv50->state.point_sprite = false;
       }
       return;
    } else {
-      nv50->state.point_sprite = TRUE;
+      nv50->state.point_sprite = true;
    }
 
    memset(pntc, 0, sizeof(pntc));
@@ -646,7 +648,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
             nv50_query_pushbuf_submit(push, targ->pq, 0x4);
          } else {
             PUSH_DATA(push, 0);
-            targ->clean = FALSE;
+            targ->clean = false;
          }
       } else {
          const unsigned limit = targ->pipe.buffer_size /
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index d4d41af3c61..9505a0b4085 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -62,7 +62,7 @@
  *     in advance to maintain elegant separate shader objects.)
  */
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_colormask(unsigned mask)
 {
    uint32_t ret = 0;
@@ -82,7 +82,7 @@ nv50_colormask(unsigned mask)
 #define NV50_BLEND_FACTOR_CASE(a, b) \
    case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b
 
-static INLINE uint32_t
+static inline uint32_t
 nv50_blend_fac(unsigned factor)
 {
    switch (factor) {
@@ -116,7 +116,7 @@ nv50_blend_state_create(struct pipe_context *pipe,
 {
    struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj);
    int i;
-   boolean emit_common_func = cso->rt[0].blend_enable;
+   bool emit_common_func = cso->rt[0].blend_enable;
    uint32_t ms;
 
    if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
@@ -137,11 +137,11 @@ nv50_blend_state_create(struct pipe_context *pipe,
       for (i = 0; i < 8; ++i) {
          SB_DATA(so, cso->rt[i].blend_enable);
          if (cso->rt[i].blend_enable)
-            emit_common_func = TRUE;
+            emit_common_func = true;
       }
 
       if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
-         emit_common_func = FALSE;
+         emit_common_func = false;
 
          for (i = 0; i < 8; ++i) {
             if (!cso->rt[i].blend_enable)
@@ -373,6 +373,16 @@ nv50_zsa_state_create(struct pipe_context *pipe,
       SB_DATA    (so, 0);
    }
 
+   SB_BEGIN_3D(so, DEPTH_BOUNDS_EN, 1);
+   if (cso->depth.bounds_test) {
+      SB_DATA    (so, 1);
+      SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2);
+      SB_DATA    (so, fui(cso->depth.bounds_min));
+      SB_DATA    (so, fui(cso->depth.bounds_max));
+   } else {
+      SB_DATA    (so, 0);
+   }
+
    if (cso->stencil[0].enabled) {
       SB_BEGIN_3D(so, STENCIL_ENABLE, 5);
       SB_DATA    (so, 1);
@@ -439,7 +449,7 @@ nv50_zsa_state_delete(struct pipe_context *pipe, void *hwcso)
 #define NV50_TSC_WRAP_CASE(n) \
     case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n
 
-static INLINE unsigned
+static inline unsigned
 nv50_tsc_wrap_mode(unsigned wrap)
 {
    switch (wrap) {
@@ -572,7 +582,7 @@ nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
    FREE(hwcso);
 }
 
-static INLINE void
+static inline void
 nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,
                                unsigned nr, void **hwcso)
 {
@@ -650,7 +660,7 @@ nv50_sampler_view_destroy(struct pipe_context *pipe,
    FREE(nv50_tic_entry(view));
 }
 
-static INLINE void
+static inline void
 nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
                              unsigned nr,
                              struct pipe_sampler_view **views)
@@ -808,7 +818,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 
    pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res);
 
-   nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
+   nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? true : false;
    if (nv50->constbuf[s][i].user) {
       nv50->constbuf[s][i].u.data = cb->user_buffer;
       nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
@@ -1041,7 +1051,7 @@ nv50_so_target_create(struct pipe_context *pipe,
    } else {
       targ->pq = NULL;
    }
-   targ->clean = TRUE;
+   targ->clean = true;
 
    targ->pipe.buffer_size = size;
    targ->pipe.buffer_offset = offset;
@@ -1075,32 +1085,32 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
 {
    struct nv50_context *nv50 = nv50_context(pipe);
    unsigned i;
-   boolean serialize = TRUE;
-   const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
+   bool serialize = true;
+   const bool can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
 
    assert(num_targets <= 4);
 
    for (i = 0; i < num_targets; ++i) {
-      const boolean changed = nv50->so_target[i] != targets[i];
-      const boolean append = (offsets[i] == (unsigned)-1);
+      const bool changed = nv50->so_target[i] != targets[i];
+      const bool append = (offsets[i] == (unsigned)-1);
       if (!changed && append)
          continue;
       nv50->so_targets_dirty |= 1 << i;
 
       if (can_resume && changed && nv50->so_target[i]) {
          nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
-         serialize = FALSE;
+         serialize = false;
       }
 
       if (targets[i] && !append)
-         nv50_so_target(targets[i])->clean = TRUE;
+         nv50_so_target(targets[i])->clean = true;
 
       pipe_so_target_reference(&nv50->so_target[i], targets[i]);
    }
    for (; i < nv50->num_so_targets; ++i) {
       if (can_resume && nv50->so_target[i]) {
          nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
-         serialize = FALSE;
+         serialize = false;
       }
       pipe_so_target_reference(&nv50->so_target[i], NULL);
       nv50->so_targets_dirty |= 1 << i;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 116bf4bba7c..985603df5fa 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -2,7 +2,7 @@
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_defs.xml.h"
 
-static INLINE void
+static inline void
 nv50_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i)
 {
    BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(i)), 4);
@@ -82,7 +82,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       ms_mode = mt->ms_mode;
 
       if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-         nv50->state.rt_serialize = TRUE;
+         nv50->state.rt_serialize = true;
       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -111,7 +111,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       ms_mode = mt->ms_mode;
 
       if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-         nv50->state.rt_serialize = TRUE;
+         nv50->state.rt_serialize = true;
       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -275,7 +275,7 @@ nv50_validate_viewport(struct nv50_context *nv50)
    nv50->viewports_dirty = 0;
 }
 
-static INLINE void
+static inline void
 nv50_check_program_ucps(struct nv50_context *nv50,
                         struct nv50_program *vp, uint8_t mask)
 {
@@ -296,6 +296,23 @@ nv50_check_program_ucps(struct nv50_context *nv50,
    nv50_fp_linkage_validate(nv50);
 }
 
+/* alpha test is disabled if there are no color RTs, so make sure we have at
+ * least one if alpha test is enabled. Note that this must run after
+ * nv50_validate_fb, otherwise that will override the RT count setting.
+ */
+static void
+nv50_validate_derived_2(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+   if (nv50->zsa && nv50->zsa->pipe.alpha.enabled &&
+       nv50->framebuffer.nr_cbufs == 0) {
+      nv50_fb_set_null_rt(push, 0);
+      BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
+      PUSH_DATA (push, (076543210 << 4) | 1);
+   }
+}
+
 static void
 nv50_validate_clip(struct nv50_context *nv50)
 {
@@ -456,6 +473,7 @@ static struct state_validate {
     { nv50_gp_linkage_validate,    NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG },
     { nv50_validate_derived_rs,    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
+    { nv50_validate_derived_2,     NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
     { nv50_validate_clip,          NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
@@ -468,7 +486,7 @@ static struct state_validate {
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
 
-boolean
+bool
 nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words)
 {
    uint32_t state_mask;
@@ -490,19 +508,19 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words)
       nv50->dirty &= ~state_mask;
 
       if (nv50->state.rt_serialize) {
-         nv50->state.rt_serialize = FALSE;
+         nv50->state.rt_serialize = false;
          BEGIN_NV04(nv50->base.pushbuf, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
          PUSH_DATA (nv50->base.pushbuf, 0);
       }
 
-      nv50_bufctx_fence(nv50->bufctx_3d, FALSE);
+      nv50_bufctx_fence(nv50->bufctx_3d, false);
    }
    nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d);
    ret = nouveau_pushbuf_validate(nv50->base.pushbuf);
 
    if (unlikely(nv50->state.flushed)) {
-      nv50->state.flushed = FALSE;
-      nv50_bufctx_fence(nv50->bufctx_3d, TRUE);
+      nv50->state.flushed = false;
+      nv50_bufctx_fence(nv50->bufctx_3d, true);
    }
    return !ret;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
index eea5327b6cb..cf75d1eb11b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -31,7 +31,7 @@ struct nv50_rasterizer_stateobj {
 struct nv50_zsa_stateobj {
    struct pipe_depth_stencil_alpha_state pipe;
    int size;
-   uint32_t state[29];
+   uint32_t state[34];
 };
 
 struct nv50_constbuf {
@@ -41,7 +41,7 @@ struct nv50_constbuf {
    } u;
    uint32_t size; /* max 65536 */
    uint32_t offset;
-   boolean user; /* should only be TRUE if u.data is valid and non-NULL */
+   bool user; /* should only be true if u.data is valid and non-NULL */
 };
 
 struct nv50_vertex_element {
@@ -56,7 +56,7 @@ struct nv50_vertex_stateobj {
    unsigned num_elements;
    uint32_t instance_elts;
    uint32_t instance_bufs;
-   boolean need_conversion;
+   bool need_conversion;
    unsigned vertex_size;
    unsigned packet_vertex_limit;
    struct nv50_vertex_element element[0];
@@ -66,10 +66,10 @@ struct nv50_so_target {
    struct pipe_stream_output_target pipe;
    struct pipe_query *pq;
    unsigned stride;
-   boolean clean;
+   bool clean;
 };
 
-static INLINE struct nv50_so_target *
+static inline struct nv50_so_target *
 nv50_so_target(struct pipe_stream_output_target *ptarg)
 {
    return (struct nv50_so_target *)ptarg;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
index 99548cbdb42..e0793bb6ec4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
@@ -9,7 +9,7 @@ struct nv50_tsc_entry {
    uint32_t tsc[8];
 };
 
-static INLINE struct nv50_tsc_entry *
+static inline struct nv50_tsc_entry *
 nv50_tsc_entry(void *hwcso)
 {
    return (struct nv50_tsc_entry *)hwcso;
@@ -21,7 +21,7 @@ struct nv50_tic_entry {
    uint32_t tic[8];
 };
 
-static INLINE struct nv50_tic_entry *
+static inline struct nv50_tic_entry *
 nv50_tic_entry(struct pipe_sampler_view *view)
 {
    return (struct nv50_tic_entry *)view;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index dc9852d4e47..b1ae01692cb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -49,8 +49,8 @@
 #define NOUVEAU_DRIVER 0x50
 #include "nv50/nv50_blit.h"
 
-static INLINE uint8_t
-nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
+static inline uint8_t
+nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
 {
    uint8_t id = nv50_format_table[format].rt;
 
@@ -76,7 +76,7 @@ nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
 static int
 nv50_2d_texture_set(struct nouveau_pushbuf *push, int dst,
                     struct nv50_miptree *mt, unsigned level, unsigned layer,
-                    enum pipe_format pformat, boolean dst_src_pformat_equal)
+                    enum pipe_format pformat, bool dst_src_pformat_equal)
 {
    struct nouveau_bo *bo = mt->base.bo;
    uint32_t width, height, depth;
@@ -153,7 +153,7 @@ nv50_2d_texture_do_copy(struct nouveau_pushbuf *push,
    const enum pipe_format dfmt = dst->base.base.format;
    const enum pipe_format sfmt = src->base.base.format;
    int ret;
-   boolean eqfmt = dfmt == sfmt;
+   bool eqfmt = dfmt == sfmt;
 
    if (!PUSH_SPACE(push, 2 * 16 + 32))
       return PIPE_ERROR;
@@ -196,7 +196,7 @@ nv50_resource_copy_region(struct pipe_context *pipe,
 {
    struct nv50_context *nv50 = nv50_context(pipe);
    int ret;
-   boolean m2mf;
+   bool m2mf;
    unsigned dst_layer = dstz, src_layer = src_box->z;
 
    if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
@@ -658,7 +658,7 @@ nv50_blitter_make_vp(struct nv50_blitter *blit)
    };
 
    blit->vp.type = PIPE_SHADER_VERTEX;
-   blit->vp.translated = TRUE;
+   blit->vp.translated = true;
    blit->vp.code = (uint32_t *)code; /* const_cast */
    blit->vp.code_size = sizeof(code);
    blit->vp.max_gpr = 4;
@@ -687,24 +687,24 @@ nv50_blitter_make_fp(struct pipe_context *pipe,
 
    const unsigned target = nv50_blit_get_tgsi_texture_target(ptarg);
 
-   boolean tex_rgbaz = FALSE;
-   boolean tex_s = FALSE;
-   boolean cvt_un8 = FALSE;
+   bool tex_rgbaz = false;
+   bool tex_s = false;
+   bool cvt_un8 = false;
 
    if (mode != NV50_BLIT_MODE_PASS &&
        mode != NV50_BLIT_MODE_Z24X8 &&
        mode != NV50_BLIT_MODE_X8Z24)
-      tex_s = TRUE;
+      tex_s = true;
 
    if (mode != NV50_BLIT_MODE_X24S8 &&
        mode != NV50_BLIT_MODE_S8X24 &&
        mode != NV50_BLIT_MODE_XS)
-      tex_rgbaz = TRUE;
+      tex_rgbaz = true;
 
    if (mode != NV50_BLIT_MODE_PASS &&
        mode != NV50_BLIT_MODE_ZS &&
        mode != NV50_BLIT_MODE_XS)
-      cvt_un8 = TRUE;
+      cvt_un8 = true;
 
    ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
    if (!ureg)
@@ -1271,7 +1271,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
    int i;
    uint32_t mode;
    uint32_t mask = nv50_blit_eng2d_get_mask(info);
-   boolean b;
+   bool b;
 
    mode = nv50_blit_get_filter(info) ?
       NV50_2D_BLIT_CONTROL_FILTER_BILINEAR :
@@ -1410,7 +1410,7 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
          PUSH_DATA (push, srcy >> 32);
       }
    }
-   nv50_bufctx_fence(nv50->bufctx, FALSE);
+   nv50_bufctx_fence(nv50->bufctx, false);
 
    nouveau_bufctx_reset(nv50->bufctx, NV50_BIND_2D);
 
@@ -1432,71 +1432,82 @@ static void
 nv50_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
 {
    struct nv50_context *nv50 = nv50_context(pipe);
-   boolean eng3d = FALSE;
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   bool eng3d = false;
 
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
       if (!(info->mask & PIPE_MASK_ZS))
          return;
       if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT ||
           info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
-         eng3d = TRUE;
+         eng3d = true;
       if (info->filter != PIPE_TEX_FILTER_NEAREST)
-         eng3d = TRUE;
+         eng3d = true;
    } else {
       if (!(info->mask & PIPE_MASK_RGBA))
          return;
       if (info->mask != PIPE_MASK_RGBA)
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (nv50_miptree(info->src.resource)->layout_3d) {
-      eng3d = TRUE;
+      eng3d = true;
    } else
    if (info->src.box.depth != info->dst.box.depth) {
-      eng3d = TRUE;
+      eng3d = true;
       debug_printf("blit: cannot filter array or cube textures in z direction");
    }
 
    if (!eng3d && info->dst.format != info->src.format) {
       if (!nv50_2d_dst_format_faithful(info->dst.format) ||
           !nv50_2d_src_format_faithful(info->src.format)) {
-         eng3d = TRUE;
+         eng3d = true;
       } else
       if (!nv50_2d_src_format_faithful(info->src.format)) {
          if (!util_format_is_luminance(info->src.format)) {
             if (util_format_is_intensity(info->src.format))
-               eng3d = TRUE;
+               eng3d = true;
             else
             if (!nv50_2d_dst_format_ops_supported(info->dst.format))
-               eng3d = TRUE;
+               eng3d = true;
             else
                eng3d = !nv50_2d_format_supported(info->src.format);
          }
       } else
       if (util_format_is_luminance_alpha(info->src.format))
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (info->src.resource->nr_samples == 8 &&
        info->dst.resource->nr_samples <= 1)
-      eng3d = TRUE;
+      eng3d = true;
 
    /* FIXME: can't make this work with eng2d anymore */
    if ((info->src.resource->nr_samples | 1) !=
        (info->dst.resource->nr_samples | 1))
-      eng3d = TRUE;
+      eng3d = true;
 
    /* FIXME: find correct src coordinate adjustments */
    if ((info->src.box.width !=  info->dst.box.width &&
         info->src.box.width != -info->dst.box.width) ||
        (info->src.box.height !=  info->dst.box.height &&
         info->src.box.height != -info->dst.box.height))
-      eng3d = TRUE;
+      eng3d = true;
+
+   if (nv50->screen->num_occlusion_queries_active) {
+      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+      PUSH_DATA (push, 0);
+   }
 
    if (!eng3d)
       nv50_blit_eng2d(nv50, info);
    else
       nv50_blit_3d(nv50, info);
+
+   if (nv50->screen->num_occlusion_queries_active) {
+      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+      PUSH_DATA (push, 1);
+   }
 }
 
 static void
@@ -1505,13 +1516,13 @@ nv50_flush_resource(struct pipe_context *ctx,
 {
 }
 
-boolean
+bool
 nv50_blitter_create(struct nv50_screen *screen)
 {
    screen->blitter = CALLOC_STRUCT(nv50_blitter);
    if (!screen->blitter) {
       NOUVEAU_ERR("failed to allocate blitter struct\n");
-      return FALSE;
+      return false;
    }
 
    pipe_mutex_init(screen->blitter->mutex);
@@ -1519,7 +1530,7 @@ nv50_blitter_create(struct nv50_screen *screen)
    nv50_blitter_make_vp(screen->blitter);
    nv50_blitter_make_sampler(screen->blitter);
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -1542,20 +1553,20 @@ nv50_blitter_destroy(struct nv50_screen *screen)
    FREE(blitter);
 }
 
-boolean
+bool
 nv50_blitctx_create(struct nv50_context *nv50)
 {
    nv50->blit = CALLOC_STRUCT(nv50_blitctx);
    if (!nv50->blit) {
       NOUVEAU_ERR("failed to allocate blit context\n");
-      return FALSE;
+      return false;
    }
 
    nv50->blit->nv50 = nv50;
 
    nv50->blit->rast.pipe.half_pixel_center = 1;
 
-   return TRUE;
+   return true;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index d69c8d6ff0d..fc6374d1b1b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -31,8 +31,8 @@
    (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK |   \
     NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
 
-static INLINE uint32_t
-nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int)
+static inline uint32_t
+nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int)
 {
    switch (swz) {
    case PIPE_SWIZZLE_RED:
@@ -71,6 +71,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
                          uint32_t flags,
                          enum pipe_texture_target target)
 {
+   const uint32_t class_3d = nouveau_context(pipe)->screen->class_3d;
    const struct util_format_description *desc;
    uint64_t addr;
    uint32_t *tic;
@@ -78,7 +79,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
    uint32_t depth;
    struct nv50_tic_entry *view;
    struct nv50_miptree *mt = nv50_miptree(texture);
-   boolean tex_int;
+   bool tex_int;
 
    view = MALLOC_STRUCT(nv50_tic_entry);
    if (!view)
@@ -192,7 +193,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
       break;
    default:
       NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target);
-      return FALSE;
+      return NULL;
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
@@ -201,11 +202,17 @@ nv50_create_texture_view(struct pipe_context *pipe,
 
    tic[5] = (mt->base.base.height0 << mt->ms_y) & 0xffff;
    tic[5] |= depth << 16;
-   tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT;
+   if (class_3d > NV50_3D_CLASS)
+      tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT;
+   else
+      tic[5] |= view->pipe.u.tex.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT;
 
    tic[6] = (mt->ms_x > 1) ? 0x88000000 : 0x03000000; /* sampling points */
 
-   tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level;
+   if (class_3d > NV50_3D_CLASS)
+      tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level;
+   else
+      tic[7] = 0;
 
    if (unlikely(!(tic[2] & NV50_TIC_2_NORMALIZED_COORDS)))
       if (mt->base.base.last_level)
@@ -214,13 +221,13 @@ nv50_create_texture_view(struct pipe_context *pipe,
    return &view->pipe;
 }
 
-static boolean
+static bool
 nv50_validate_tic(struct nv50_context *nv50, int s)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nouveau_bo *txc = nv50->screen->txc;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    assert(nv50->num_textures[s] <= PIPE_MAX_SAMPLERS);
    for (i = 0; i < nv50->num_textures[s]; ++i) {
@@ -263,7 +270,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s)
          BEGIN_NI04(push, NV50_2D(SIFC_DATA), 8);
          PUSH_DATAp(push, &tic->tic[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       } else
       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
          BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
@@ -309,7 +316,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s)
 
 void nv50_validate_textures(struct nv50_context *nv50)
 {
-   boolean need_flush;
+   bool need_flush;
 
    need_flush  = nv50_validate_tic(nv50, 0);
    need_flush |= nv50_validate_tic(nv50, 1);
@@ -321,12 +328,12 @@ void nv50_validate_textures(struct nv50_context *nv50)
    }
 }
 
-static boolean
+static bool
 nv50_validate_tsc(struct nv50_context *nv50, int s)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    assert(nv50->num_samplers[s] <= PIPE_MAX_SAMPLERS);
    for (i = 0; i < nv50->num_samplers[s]; ++i) {
@@ -343,7 +350,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s)
          nv50_sifc_linear_u8(&nv50->base, nv50->screen->txc,
                              65536 + tsc->id * 32,
                              NOUVEAU_BO_VRAM, 32, tsc->tsc);
-         need_flush = TRUE;
+         need_flush = true;
       }
       nv50->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
 
@@ -361,7 +368,7 @@ nv50_validate_tsc(struct nv50_context *nv50, int s)
 
 void nv50_validate_samplers(struct nv50_context *nv50)
 {
-   boolean need_flush;
+   bool need_flush;
 
    need_flush  = nv50_validate_tsc(nv50, 0);
    need_flush |= nv50_validate_tsc(nv50, 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 1fd33b8aa59..6324726acec 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -58,7 +58,7 @@ nv50_vertex_state_create(struct pipe_context *pipe,
     so->num_elements = num_elements;
     so->instance_elts = 0;
     so->instance_bufs = 0;
-    so->need_conversion = FALSE;
+    so->need_conversion = false;
 
     memset(so->vb_access_size, 0, sizeof(so->vb_access_size));
 
@@ -89,7 +89,7 @@ nv50_vertex_state_create(struct pipe_context *pipe,
                 return NULL;
             }
             so->element[i].state = nv50_format_table[fmt].vtx;
-            so->need_conversion = TRUE;
+            so->need_conversion = true;
         }
         so->element[i].state |= i;
 
@@ -188,7 +188,7 @@ nv50_emit_vtxattr(struct nv50_context *nv50, struct pipe_vertex_buffer *vb,
    }
 }
 
-static INLINE void
+static inline void
 nv50_user_vbuf_range(struct nv50_context *nv50, unsigned vbi,
                      uint32_t *base, uint32_t *size)
 {
@@ -229,7 +229,7 @@ nv50_upload_user_buffers(struct nv50_context *nv50,
          BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART |
                       NOUVEAU_BO_RD, bo);
    }
-   nv50->base.vbo_dirty = TRUE;
+   nv50->base.vbo_dirty = true;
 }
 
 static void
@@ -275,10 +275,10 @@ nv50_update_user_vbufs(struct nv50_context *nv50)
       PUSH_DATAh(push, address[b] + ve->src_offset);
       PUSH_DATA (push, address[b] + ve->src_offset);
    }
-   nv50->base.vbo_dirty = TRUE;
+   nv50->base.vbo_dirty = true;
 }
 
-static INLINE void
+static inline void
 nv50_release_user_vbufs(struct nv50_context *nv50)
 {
    if (nv50->vbo_user) {
@@ -316,7 +316,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
          struct nv04_resource *buf = nv04_resource(nv50->vtxbuf[i].buffer);
          if (buf && buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
             buf->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
-            nv50->base.vbo_dirty = TRUE;
+            nv50->base.vbo_dirty = true;
             break;
          }
       }
@@ -382,6 +382,11 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
       if (nv50->vbo_user & (1 << b)) {
          address = addrs[b] + ve->pipe.src_offset;
          limit = addrs[b] + limits[b];
+      } else
+      if (!vb->buffer) {
+         BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1);
+         PUSH_DATA (push, 0);
+         continue;
       } else {
          struct nv04_resource *buf = nv04_resource(vb->buffer);
          if (!(refd & (1 << b))) {
@@ -418,7 +423,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
 #define NV50_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nv50_prim_gl(unsigned prim)
 {
    switch (prim) {
@@ -585,7 +590,7 @@ nv50_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
 }
 
 static void
-nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
+nv50_draw_elements(struct nv50_context *nv50, bool shorten,
                    unsigned mode, unsigned start, unsigned count,
                    unsigned instance_count, int32_t index_bias)
 {
@@ -746,9 +751,9 @@ nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan)
 {
    struct nv50_screen *screen = chan->user_priv;
 
-   nouveau_fence_update(&screen->base, TRUE);
+   nouveau_fence_update(&screen->base, true);
 
-   nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, TRUE);
+   nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, true);
 }
 
 void
@@ -801,7 +806,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             continue;
 
          if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nv50->cb_dirty = TRUE;
+            nv50->cb_dirty = true;
       }
    }
 
@@ -809,7 +814,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv50->cb_dirty) {
       BEGIN_NV04(push, NV50_3D(CODE_CB_FLUSH), 1);
       PUSH_DATA (push, 0);
-      nv50->cb_dirty = FALSE;
+      nv50->cb_dirty = false;
    }
 
    if (nv50->vbo_fifo) {
@@ -830,21 +835,21 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (!nv50->vtxbuf[i].buffer)
          continue;
       if (nv50->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nv50->base.vbo_dirty = TRUE;
+         nv50->base.vbo_dirty = true;
    }
 
    if (!nv50->base.vbo_dirty && nv50->idxbuf.buffer &&
        nv50->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-      nv50->base.vbo_dirty = TRUE;
+      nv50->base.vbo_dirty = true;
 
    if (nv50->base.vbo_dirty) {
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
       PUSH_DATA (push, 0);
-      nv50->base.vbo_dirty = FALSE;
+      nv50->base.vbo_dirty = false;
    }
 
    if (info->indexed) {
-      boolean shorten = info->max_index <= 65535;
+      bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart != nv50->state.prim_restart) {
          if (info->primitive_restart) {
@@ -853,7 +858,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             PUSH_DATA (push, info->restart_index);
 
             if (info->restart_index > 65535)
-               shorten = FALSE;
+               shorten = false;
          } else {
             BEGIN_NV04(push, NV50_3D(PRIM_RESTART_ENABLE), 1);
             PUSH_DATA (push, 0);
@@ -865,7 +870,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
          PUSH_DATA (push, info->restart_index);
 
          if (info->restart_index > 65535)
-            shorten = FALSE;
+            shorten = false;
       }
 
       nv50_draw_elements(nv50, shorten,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
index e8578c8be6f..76f1b41ea70 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
@@ -16,14 +16,14 @@
 #endif
 
 
-static INLINE void
+static inline void
 nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin,
                             unsigned flags, struct nouveau_bo *bo)
 {
    nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL;
 }
 
-static INLINE void
+static inline void
 nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin,
                          struct nv04_resource *res, unsigned flags)
 {
@@ -39,7 +39,7 @@ nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin,
 #define BCTX_REFN(bctx, bin, res, acc) \
    nv50_add_bufctx_resident(bctx, NV50_BIND_##bin, res, NOUVEAU_BO_##acc)
 
-static INLINE void
+static inline void
 PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 {
    struct nouveau_pushbuf_refn ref = { bo, flags };
@@ -61,39 +61,39 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n)
 
 
-static INLINE uint32_t
+static inline uint32_t
 NV50_FIFO_PKHDR(int subc, int mthd, unsigned size)
 {
    return 0x00000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NV50_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
 {
    return 0x40000000 | (size << 18) | (subc << 13) | mthd;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NV50_FIFO_PKHDR_L(int subc, int mthd)
 {
    return 0x00030000 | (subc << 13) | mthd;
 }
 
 
-static INLINE uint32_t
+static inline uint32_t
 nouveau_bo_memtype(const struct nouveau_bo *bo)
 {
    return bo->config.nv50.memtype;
 }
 
 
-static INLINE void
+static inline void
 PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data)
 {
    *push->cur++ = (uint32_t)(data >> 32);
 }
 
-static INLINE void
+static inline void
 BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
@@ -102,7 +102,7 @@ BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NV50_FIFO_PKHDR(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
@@ -112,7 +112,7 @@ BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 }
 
 /* long, non-incremental, nv50-only */
-static INLINE void
+static inline void
 BEGIN_NL50(struct nouveau_pushbuf *push, int subc, int mthd, uint32_t size)
 {
 #ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.h b/src/gallium/drivers/nouveau/nv50/nv84_video.h
index 2edba389dbf..09773c12974 100644
--- a/src/gallium/drivers/nouveau/nv50/nv84_video.h
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video.h
@@ -102,12 +102,12 @@ struct nv84_decoder {
    uint8_t mpeg12_non_intra_matrix[64];
 };
 
-static INLINE uint32_t mb(uint32_t coord)
+static inline uint32_t mb(uint32_t coord)
 {
    return (coord + 0xf)>>4;
 }
 
-static INLINE uint32_t mb_half(uint32_t coord)
+static inline uint32_t mb_half(uint32_t coord)
 {
    return (coord + 0x1f)>>5;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
index f3480b2e00e..8b121477a37 100644
--- a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
@@ -221,7 +221,7 @@ nv84_decoder_vp_h264(struct nv84_decoder *dec,
    PUSH_KICK (push);
 }
 
-static INLINE int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
+static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
    int16_t ret = val * quant / 16;
    if (mpeg1 && ret) {
       if (ret > 0)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index 56fc83d3679..47bd123621b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -121,51 +121,51 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen,
    return 0;
 }
 
-boolean
+bool
 nvc0_compute_validate_program(struct nvc0_context *nvc0)
 {
    struct nvc0_program *prog = nvc0->compprog;
 
    if (prog->mem)
-      return TRUE;
+      return true;
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
          prog, nvc0->screen->base.device->chipset);
       if (!prog->translated)
-         return FALSE;
+         return false;
    }
    if (unlikely(!prog->code_size))
-      return FALSE;
+      return false;
 
    if (likely(prog->code_size)) {
       if (nvc0_program_upload_code(nvc0, prog)) {
          struct nouveau_pushbuf *push = nvc0->base.pushbuf;
          BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
          PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE);
-         return TRUE;
+         return true;
       }
    }
-   return FALSE;
+   return false;
 }
 
-static boolean
+static bool
 nvc0_compute_state_validate(struct nvc0_context *nvc0)
 {
    if (!nvc0_compute_validate_program(nvc0))
-      return FALSE;
+      return false;
 
    /* TODO: textures, samplers, surfaces, global memory buffers */
 
-   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
+   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false);
 
    nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
    if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
-      return FALSE;
+      return false;
    if (unlikely(nvc0->state.flushed))
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
 
-   return TRUE;
+   return true;
 
 }
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
index 9a1a71760d7..168a6d1bee2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
@@ -4,7 +4,7 @@
 #include "nv50/nv50_defs.xml.h"
 #include "nvc0/nvc0_compute.xml.h"
 
-boolean
+bool
 nvc0_compute_validate_program(struct nvc0_context *nvc0);
 
 #endif /* NVC0_COMPUTE_H */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index a35c3f66142..84f8db6a8ac 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -63,12 +63,12 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
          if (!nvc0->vtxbuf[i].buffer)
             continue;
          if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-            nvc0->base.vbo_dirty = TRUE;
+            nvc0->base.vbo_dirty = true;
       }
 
       if (nvc0->idxbuf.buffer &&
           nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-         nvc0->base.vbo_dirty = TRUE;
+         nvc0->base.vbo_dirty = true;
 
       for (s = 0; s < 5 && !nvc0->cb_dirty; ++s) {
          uint32_t valid = nvc0->constbuf_valid[s];
@@ -86,7 +86,7 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
                continue;
 
             if (res->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT)
-               nvc0->cb_dirty = TRUE;
+               nvc0->cb_dirty = true;
          }
       }
    }
@@ -164,9 +164,9 @@ nvc0_default_kick_notify(struct nouveau_pushbuf *push)
 
    if (screen) {
       nouveau_fence_next(&screen->base);
-      nouveau_fence_update(&screen->base, TRUE);
+      nouveau_fence_update(&screen->base, true);
       if (screen->cur_ctx)
-         screen->cur_ctx->state.flushed = TRUE;
+         screen->cur_ctx->state.flushed = true;
       NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1);
    }
 }
@@ -378,7 +378,7 @@ out_err:
 
 void
 nvc0_bufctx_fence(struct nvc0_context *nvc0, struct nouveau_bufctx *bufctx,
-                  boolean on_flush)
+                  bool on_flush)
 {
    struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending;
    struct nouveau_list *it;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index a8d7593b398..f4499423a10 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -54,6 +54,7 @@
 #define NVC0_NEW_IDXBUF       (1 << 22)
 #define NVC0_NEW_SURFACES     (1 << 23)
 #define NVC0_NEW_MIN_SAMPLES  (1 << 24)
+#define NVC0_NEW_TESSFACTOR   (1 << 25)
 
 #define NVC0_NEW_CP_PROGRAM   (1 << 0)
 #define NVC0_NEW_CP_SURFACES  (1 << 1)
@@ -93,7 +94,7 @@
 
 struct nvc0_blitctx;
 
-boolean nvc0_blitctx_create(struct nvc0_context *);
+bool nvc0_blitctx_create(struct nvc0_context *);
 void nvc0_blitctx_destroy(struct nvc0_context *);
 
 struct nvc0_context {
@@ -130,7 +131,7 @@ struct nvc0_context {
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
-   boolean cb_dirty;
+   bool cb_dirty;
 
    struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
    unsigned num_vtxbufs;
@@ -164,14 +165,17 @@ struct nvc0_context {
    unsigned sample_mask;
    unsigned min_samples;
 
-   boolean vbo_push_hint;
+   float default_tess_outer[4];
+   float default_tess_inner[2];
+
+   bool vbo_push_hint;
 
    uint8_t tfbbuf_dirty;
    struct pipe_stream_output_target *tfbbuf[4];
    unsigned num_tfbbufs;
 
    struct pipe_query *cond_query;
-   boolean cond_cond; /* inverted rendering condition */
+   bool cond_cond; /* inverted rendering condition */
    uint cond_mode;
    uint32_t cond_condmode; /* the calculated condition */
 
@@ -184,19 +188,19 @@ struct nvc0_context {
    struct util_dynarray global_residents;
 };
 
-static INLINE struct nvc0_context *
+static inline struct nvc0_context *
 nvc0_context(struct pipe_context *pipe)
 {
    return (struct nvc0_context *)pipe;
 }
 
-static INLINE unsigned
+static inline unsigned
 nvc0_shader_stage(unsigned pipe)
 {
    switch (pipe) {
    case PIPE_SHADER_VERTEX: return 0;
-/* case PIPE_SHADER_TESSELLATION_CONTROL: return 1; */
-/* case PIPE_SHADER_TESSELLATION_EVALUATION: return 2; */
+   case PIPE_SHADER_TESS_CTRL: return 1;
+   case PIPE_SHADER_TESS_EVAL: return 2;
    case PIPE_SHADER_GEOMETRY: return 3;
    case PIPE_SHADER_FRAGMENT: return 4;
    case PIPE_SHADER_COMPUTE: return 5;
@@ -210,15 +214,15 @@ nvc0_shader_stage(unsigned pipe)
 /* nvc0_context.c */
 struct pipe_context *nvc0_create(struct pipe_screen *, void *);
 void nvc0_bufctx_fence(struct nvc0_context *, struct nouveau_bufctx *,
-                       boolean on_flush);
+                       bool on_flush);
 void nvc0_default_kick_notify(struct nouveau_pushbuf *);
 
 /* nvc0_draw.c */
 extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);
 
 /* nvc0_program.c */
-boolean nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
-boolean nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
+bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
+bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);
 uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
@@ -231,7 +235,7 @@ void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *,
 void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
 void nvc0_so_target_save_offset(struct pipe_context *,
                                 struct pipe_stream_output_target *, unsigned i,
-                                boolean *serialize);
+                                bool *serialize);
 
 #define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
@@ -250,8 +254,8 @@ extern void nvc0_init_state_functions(struct nvc0_context *);
 /* nvc0_state_validate.c */
 void nvc0_validate_global_residents(struct nvc0_context *,
                                     struct nouveau_bufctx *, int bin);
-extern boolean nvc0_state_validate(struct nvc0_context *, uint32_t state_mask,
-                                   unsigned space_words);
+extern bool nvc0_state_validate(struct nvc0_context *, uint32_t state_mask,
+                                unsigned space_words);
 
 /* nvc0_surface.c */
 extern void nvc0_clear(struct pipe_context *, unsigned buffers,
@@ -260,7 +264,7 @@ extern void nvc0_clear(struct pipe_context *, unsigned buffers,
 extern void nvc0_init_surface_functions(struct nvc0_context *);
 
 /* nvc0_tex.c */
-boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s);
+bool nve4_validate_tsc(struct nvc0_context *nvc0, int s);
 void nvc0_validate_textures(struct nvc0_context *);
 void nvc0_validate_samplers(struct nvc0_context *);
 void nve4_set_tex_handles(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
index 3875bbf4ca4..15991c3d2bd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -29,13 +29,13 @@
 #include "nvc0/nvc0_resource.h"
 
 static uint32_t
-nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, boolean is_3d)
+nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d)
 {
    return nv50_tex_choose_tile_dims_helper(nx, ny, nz, is_3d);
 }
 
 static uint32_t
-nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
+nvc0_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed)
 {
    const unsigned ms = util_logbase2(mt->base.base.nr_samples);
 
@@ -133,7 +133,7 @@ nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
    return tile_flags;
 }
 
-static INLINE boolean
+static inline bool
 nvc0_miptree_init_ms_mode(struct nv50_miptree *mt)
 {
    switch (mt->base.base.nr_samples) {
@@ -157,9 +157,9 @@ nvc0_miptree_init_ms_mode(struct nv50_miptree *mt)
       break;
    default:
       NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples);
-      return FALSE;
+      return false;
    }
-   return TRUE;
+   return true;
 }
 
 static void
@@ -250,7 +250,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   boolean compressed = dev->drm_version >= 0x01000101;
+   bool compressed = dev->drm_version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
@@ -325,7 +325,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
 }
 
 /* Offset of zslice @z from start of level @l. */
-INLINE unsigned
+inline unsigned
 nvc0_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z)
 {
    const struct pipe_resource *pt = &mt->base.base;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index e1f5a8c4416..507a2507fe3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -31,24 +31,25 @@
  * 124 scalar varying values.
  */
 static uint32_t
-nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
+nvc0_shader_input_address(unsigned sn, unsigned si)
 {
    switch (sn) {
-   case NV50_SEMANTIC_TESSFACTOR:   return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
    case TGSI_SEMANTIC_PRIMID:       return 0x060;
    case TGSI_SEMANTIC_LAYER:        return 0x064;
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
    case TGSI_SEMANTIC_PSIZE:        return 0x06c;
    case TGSI_SEMANTIC_POSITION:     return 0x070;
-   case TGSI_SEMANTIC_GENERIC:      return ubase + si * 0x10;
+   case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
    case TGSI_SEMANTIC_FOG:          return 0x2e8;
    case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
    case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
-   case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4;
    case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
    case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
    case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
-   case NV50_SEMANTIC_TESSCOORD:    return 0x2f0;
+   case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
@@ -60,20 +61,21 @@ nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
 }
 
 static uint32_t
-nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase)
+nvc0_shader_output_address(unsigned sn, unsigned si)
 {
    switch (sn) {
-   case NV50_SEMANTIC_TESSFACTOR:    return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
    case TGSI_SEMANTIC_PRIMID:        return 0x060;
    case TGSI_SEMANTIC_LAYER:         return 0x064;
    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
    case TGSI_SEMANTIC_PSIZE:         return 0x06c;
    case TGSI_SEMANTIC_POSITION:      return 0x070;
-   case TGSI_SEMANTIC_GENERIC:       return ubase + si * 0x10;
+   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
    case TGSI_SEMANTIC_FOG:           return 0x2e8;
    case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
    case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
-   case NV50_SEMANTIC_CLIPDISTANCE:  return 0x2c0 + si * 0x4;
    case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
    case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
    case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
@@ -95,7 +97,7 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
       case TGSI_SEMANTIC_VERTEXID:
          info->in[i].mask = 0x1;
          info->in[i].slot[0] =
-            nvc0_shader_input_address(info->in[i].sn, 0, 0) / 4;
+            nvc0_shader_input_address(info->in[i].sn, 0) / 4;
          continue;
       default:
          break;
@@ -111,18 +113,11 @@ nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
 static int
 nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
 {
-   unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10);
    unsigned offset;
    unsigned i, c;
 
    for (i = 0; i < info->numInputs; ++i) {
-      offset = nvc0_shader_input_address(info->in[i].sn,
-                                         info->in[i].si, ubase);
-      if (info->in[i].patch && offset >= 0x20)
-         offset = 0x20 + info->in[i].si * 0x10;
-
-      if (info->in[i].sn == NV50_SEMANTIC_TESSCOORD)
-         info->in[i].mask &= 3;
+      offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);
 
       for (c = 0; c < 4; ++c)
          info->in[i].slot[c] = (offset + c * 0x4) / 4;
@@ -157,15 +152,11 @@ nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
 static int
 nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
 {
-   unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10);
    unsigned offset;
    unsigned i, c;
 
    for (i = 0; i < info->numOutputs; ++i) {
-      offset = nvc0_shader_output_address(info->out[i].sn,
-                                          info->out[i].si, ubase);
-      if (info->out[i].patch && offset >= 0x20)
-         offset = 0x20 + info->out[i].si * 0x10;
+      offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);
 
       for (c = 0; c < 4; ++c)
          info->out[i].slot[c] = (offset + c * 0x4) / 4;
@@ -193,7 +184,7 @@ nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info)
    return ret;
 }
 
-static INLINE void
+static inline void
 nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)
 {
    uint8_t min = (vp->hdr[4] >> 12) & 0xff;
@@ -216,12 +207,8 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
          continue;
       for (c = 0; c < 4; ++c) {
          a = info->in[i].slot[c];
-         if (info->in[i].mask & (1 << c)) {
-            if (info->in[i].sn != NV50_SEMANTIC_TESSCOORD)
-               vp->hdr[5 + a / 32] |= 1 << (a % 32);
-            else
-               nvc0_vtgp_hdr_update_oread(vp, info->in[i].slot[c]);
-         }
+         if (info->in[i].mask & (1 << c))
+            vp->hdr[5 + a / 32] |= 1 << (a % 32);
       }
    }
 
@@ -250,6 +237,14 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
       case TGSI_SEMANTIC_VERTEXID:
          vp->hdr[10] |= 1 << 31;
          break;
+      case TGSI_SEMANTIC_TESSCOORD:
+         /* We don't have the mask, nor the slots populated. While this could
+          * be achieved, the vast majority of the time if either of the coords
+          * are read, then both will be read.
+          */
+         nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4);
+         nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4);
+         break;
       default:
          break;
       }
@@ -277,7 +272,6 @@ nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
    return nvc0_vtgp_gen_header(vp, info);
 }
 
-#if defined(PIPE_SHADER_HULL) || defined(PIPE_SHADER_DOMAIN)
 static void
 nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
 {
@@ -305,14 +299,13 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
 
    switch (info->prop.tp.partitioning) {
-   case PIPE_TESS_PART_INTEGER:
-   case PIPE_TESS_PART_POW2:
+   case PIPE_TESS_SPACING_EQUAL:
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;
       break;
-   case PIPE_TESS_PART_FRACT_ODD:
+   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;
       break;
-   case PIPE_TESS_PART_FRACT_EVEN:
+   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;
       break;
    default:
@@ -320,9 +313,7 @@ nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
       break;
    }
 }
-#endif
 
-#ifdef PIPE_SHADER_HULL
 static int
 nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
 {
@@ -346,9 +337,7 @@ nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
 
    return 0;
 }
-#endif
 
-#ifdef PIPE_SHADER_DOMAIN
 static int
 nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
 {
@@ -365,7 +354,6 @@ nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
 
    return 0;
 }
-#endif
 
 static int
 nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
@@ -523,7 +511,7 @@ nvc0_program_dump(struct nvc0_program *prog)
 }
 #endif
 
-boolean
+bool
 nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
 {
    struct nv50_ir_prog_info *info;
@@ -531,7 +519,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
 
    info = CALLOC_STRUCT(nv50_ir_prog_info);
    if (!info)
-      return FALSE;
+      return false;
 
    info->type = prog->type;
    info->target = chipset;
@@ -598,16 +586,12 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    case PIPE_SHADER_VERTEX:
       ret = nvc0_vp_gen_header(prog, info);
       break;
-#ifdef PIPE_SHADER_HULL
-   case PIPE_SHADER_HULL:
+   case PIPE_SHADER_TESS_CTRL:
       ret = nvc0_tcp_gen_header(prog, info);
       break;
-#endif
-#ifdef PIPE_SHADER_DOMAIN
-   case PIPE_SHADER_DOMAIN:
+   case PIPE_SHADER_TESS_EVAL:
       ret = nvc0_tep_gen_header(prog, info);
       break;
-#endif
    case PIPE_SHADER_GEOMETRY:
       ret = nvc0_gp_gen_header(prog, info);
       break;
@@ -630,7 +614,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
       assert(info->bin.tlsSpace < (1 << 24));
       prog->hdr[0] |= 1 << 26;
       prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */
-      prog->need_tls = TRUE;
+      prog->need_tls = true;
    }
    /* TODO: factor 2 only needed where joinat/precont is used,
     *       and we only have to count non-uniform branches
@@ -638,7 +622,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    /*
    if ((info->maxCFDepth * 2) > 16) {
       prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;
-      prog->need_tls = TRUE;
+      prog->need_tls = true;
    }
    */
    if (info->io.globalAccess)
@@ -655,11 +639,11 @@ out:
    return !ret;
 }
 
-boolean
+bool
 nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
 {
    struct nvc0_screen *screen = nvc0->screen;
-   const boolean is_cp = prog->type == PIPE_SHADER_COMPUTE;
+   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
    int ret;
    uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
    uint32_t lib_pos = screen->lib_code->start;
@@ -694,7 +678,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
       if (ret) {
          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
-         return FALSE;
+         return false;
       }
       IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);
    }
@@ -729,7 +713,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
       nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0);
 
 #ifdef DEBUG
-   if (debug_get_bool_option("NV50_PROG_DEBUG", FALSE))
+   if (debug_get_bool_option("NV50_PROG_DEBUG", false))
       nvc0_program_dump(prog);
 #endif
 
@@ -746,7 +730,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
    BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
    PUSH_DATA (nvc0->base.pushbuf, 0x1011);
 
-   return TRUE;
+   return true;
 }
 
 /* Upload code for builtin functions like integer division emulation. */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index 3fd9d21b4c4..390e0c7a4f0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -21,8 +21,8 @@ struct nvc0_program {
    struct pipe_shader_state pipe;
 
    ubyte type;
-   boolean translated;
-   boolean need_tls;
+   bool translated;
+   bool need_tls;
    uint8_t num_gprs;
 
    uint32_t *code;
@@ -41,7 +41,7 @@ struct nvc0_program {
       uint8_t clip_enable; /* mask of defined clip planes */
       uint8_t num_ucps; /* also set to max if ClipDistance is used */
       uint8_t edgeflag; /* attribute index of edgeflag input */
-      boolean need_vertex_id;
+      bool need_vertex_id;
    } vp;
    struct {
       uint8_t early_z;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index aea6cbda02d..f7b85a8e931 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -44,7 +44,7 @@ struct nvc0_query {
    uint32_t base;
    uint32_t offset; /* base + i * rotate */
    uint8_t state;
-   boolean is64bit;
+   bool is64bit;
    uint8_t rotate;
    int nesting; /* only used for occlusion queries */
    union {
@@ -62,13 +62,13 @@ static void nvc0_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
 static boolean nvc0_mp_pm_query_result(struct nvc0_context *,
                                        struct nvc0_query *, void *, boolean);
 
-static INLINE struct nvc0_query *
+static inline struct nvc0_query *
 nvc0_query(struct pipe_query *pipe)
 {
    return (struct nvc0_query *)pipe;
 }
 
-static boolean
+static bool
 nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
 {
    struct nvc0_screen *screen = nvc0->screen;
@@ -87,17 +87,17 @@ nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
    if (size) {
       q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
       if (!q->bo)
-         return FALSE;
+         return false;
       q->offset = q->base;
 
       ret = nouveau_bo_map(q->bo, 0, screen->base.client);
       if (ret) {
          nvc0_query_allocate(nvc0, q, 0);
-         return FALSE;
+         return false;
       }
       q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
    }
-   return TRUE;
+   return true;
 }
 
 static void
@@ -126,17 +126,17 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
       space = NVC0_QUERY_ALLOC_SPACE;
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
-      q->is64bit = TRUE;
+      q->is64bit = true;
       space = 512;
       break;
    case PIPE_QUERY_SO_STATISTICS:
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      q->is64bit = TRUE;
+      q->is64bit = true;
       space = 64;
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      q->is64bit = TRUE;
+      q->is64bit = true;
       q->index = index;
       space = 32;
       break;
@@ -257,11 +257,11 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_query *q = nvc0_query(pq);
-   boolean ret = true;
+   bool ret = true;
 
    /* For occlusion queries we have to change the storage, because a previous
-    * query might set the initial render conition to FALSE even *after* we re-
-    * initialized it to TRUE.
+    * query might set the initial render condition to false even *after* we re-
+    * initialized it to true.
     */
    if (q->rotate) {
       nvc0_query_rotate(nvc0, q);
@@ -270,7 +270,7 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
        *  query ?
        */
       q->data[0] = q->sequence; /* initialize sequence */
-      q->data[1] = 1; /* initial render condition = TRUE */
+      q->data[1] = 1; /* initial render condition = true */
       q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
       q->data[5] = 0;
    }
@@ -401,7 +401,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      /* This query is not issued on GPU because disjoint is forced to FALSE */
+      /* This query is not issued on GPU because disjoint is forced to false */
       q->state = NVC0_QUERY_STATE_READY;
       break;
    default:
@@ -422,7 +422,7 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence);
 }
 
-static INLINE void
+static inline void
 nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q)
 {
    if (q->is64bit) {
@@ -442,7 +442,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nvc0_query *q = nvc0_query(pq);
    uint64_t *res64 = (uint64_t*)result;
    uint32_t *res32 = (uint32_t*)result;
-   boolean *res8 = (boolean*)result;
+   uint8_t *res8 = (uint8_t*)result;
    uint64_t *data64 = (uint64_t *)q->data;
    unsigned i;
 
@@ -450,7 +450,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
        q->type <= NVC0_QUERY_DRV_STAT_LAST) {
       res64[0] = q->u.value;
-      return TRUE;
+      return true;
    } else
 #endif
    if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
@@ -468,17 +468,17 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
             /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
             PUSH_KICK(nvc0->base.pushbuf);
          }
-         return FALSE;
+         return false;
       }
       if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
-         return FALSE;
+         return false;
       NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
    }
    q->state = NVC0_QUERY_STATE_READY;
 
    switch (q->type) {
    case PIPE_QUERY_GPU_FINISHED:
-      res8[0] = TRUE;
+      res8[0] = true;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
       res64[0] = q->data[1] - q->data[5];
@@ -502,7 +502,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
       res64[0] = 1000000000;
-      res8[8] = FALSE;
+      res8[8] = false;
       break;
    case PIPE_QUERY_TIME_ELAPSED:
       res64[0] = data64[1] - data64[3];
@@ -516,10 +516,10 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    default:
       assert(0); /* can't happen, we don't create queries with invalid type */
-      return FALSE;
+      return false;
    }
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -549,7 +549,7 @@ nvc0_render_condition(struct pipe_context *pipe,
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_query *q;
    uint32_t cond;
-   boolean wait =
+   bool wait =
       mode != PIPE_RENDER_COND_NO_WAIT &&
       mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;
 
@@ -563,7 +563,7 @@ nvc0_render_condition(struct pipe_context *pipe,
       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
          cond = condition ? NVC0_3D_COND_MODE_EQUAL :
                           NVC0_3D_COND_MODE_NOT_EQUAL;
-         wait = TRUE;
+         wait = true;
          break;
       case PIPE_QUERY_OCCLUSION_COUNTER:
       case PIPE_QUERY_OCCLUSION_PREDICATE:
@@ -626,12 +626,12 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
 void
 nvc0_so_target_save_offset(struct pipe_context *pipe,
                            struct pipe_stream_output_target *ptarg,
-                           unsigned index, boolean *serialize)
+                           unsigned index, bool *serialize)
 {
    struct nvc0_so_target *targ = nvc0_so_target(ptarg);
 
    if (*serialize) {
-      *serialize = FALSE;
+      *serialize = false;
       PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1);
       IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0);
 
@@ -1080,7 +1080,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
    const struct nvc0_mp_pm_query_cfg *cfg;
    unsigned i, c;
    unsigned num_ab[2] = { 0, 0 };
@@ -1101,7 +1101,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
    PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6);
 
    if (!screen->pm.mp_counters_enabled) {
-      screen->pm.mp_counters_enabled = TRUE;
+      screen->pm.mp_counters_enabled = true;
       BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
       PUSH_DATA (push, 0x1fcb);
    }
@@ -1168,7 +1168,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    struct nvc0_screen *screen = nvc0->screen;
    struct pipe_context *pipe = &nvc0->base.pipe;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
    uint32_t mask;
    uint32_t input[3];
    const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
@@ -1181,7 +1181,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    if (unlikely(!screen->pm.prog)) {
       struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
       prog->type = PIPE_SHADER_COMPUTE;
-      prog->translated = TRUE;
+      prog->translated = true;
       prog->num_gprs = 14;
       prog->parm_size = 12;
       if (is_nve4) {
@@ -1249,9 +1249,9 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    }
 }
 
-static INLINE boolean
+static inline bool
 nvc0_mp_pm_query_read_data(uint32_t count[32][4],
-                           struct nvc0_context *nvc0, boolean wait,
+                           struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
                            const struct nvc0_mp_pm_query_cfg *cfg,
                            unsigned mp_count)
@@ -1264,19 +1264,19 @@ nvc0_mp_pm_query_read_data(uint32_t count[32][4],
       for (c = 0; c < cfg->num_counters; ++c) {
          if (q->data[b + 8] != q->sequence) {
             if (!wait)
-               return FALSE;
+               return false;
             if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
-               return FALSE;
+               return false;
          }
          count[p][c] = q->data[b + q->ctr[c]];
       }
    }
-   return TRUE;
+   return true;
 }
 
-static INLINE boolean
+static inline bool
 nve4_mp_pm_query_read_data(uint32_t count[32][4],
-                           struct nvc0_context *nvc0, boolean wait,
+                           struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
                            const struct nvc0_mp_pm_query_cfg *cfg,
                            unsigned mp_count)
@@ -1291,9 +1291,9 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4],
          for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) {
             if (q->data[b + 20 + d] != q->sequence) {
                if (!wait)
-                  return FALSE;
+                  return false;
                if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
-                  return FALSE;
+                  return false;
             }
             if (q->ctr[c] & ~0x3)
                count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)];
@@ -1302,7 +1302,7 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4],
          }
       }
    }
-   return TRUE;
+   return true;
 }
 
 /* Metric calculations:
@@ -1325,7 +1325,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
    unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
    unsigned p, c;
    const struct nvc0_mp_pm_query_cfg *cfg;
-   boolean ret;
+   bool ret;
 
    cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
 
@@ -1334,7 +1334,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
    else
       ret = nvc0_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
    if (!ret)
-      return FALSE;
+      return false;
 
    if (cfg->op == NVC0_COUNTER_OPn_SUM) {
       for (c = 0; c < cfg->num_counters; ++c)
@@ -1394,7 +1394,7 @@ nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
    }
 
    *(uint64_t *)result = value;
-   return TRUE;
+   return true;
 }
 
 int
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 56c230e42fc..ab19b26f156 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -44,16 +44,16 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
                                 unsigned bindings)
 {
    if (sample_count > 8)
-      return FALSE;
+      return false;
    if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */
-      return FALSE;
+      return false;
 
    if (!util_format_is_supported(format, bindings))
-      return FALSE;
+      return false;
 
    if ((bindings & PIPE_BIND_SAMPLER_VIEW) && (target != PIPE_BUFFER))
       if (util_format_get_blocksizebits(format) == 3 * 32)
-         return FALSE;
+         return false;
 
    /* transfers & shared are always supported */
    bindings &= ~(PIPE_BIND_TRANSFER_READ |
@@ -120,6 +120,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50;
    case PIPE_CAP_ENDIANNESS:
       return PIPE_ENDIAN_LITTLE;
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+      return 30;
 
    /* supported caps */
    case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -163,7 +165,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_USER_CONSTANT_BUFFERS:
    case PIPE_CAP_USER_INDEX_BUFFERS:
    case PIPE_CAP_USER_VERTEX_BUFFERS:
-   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
    case PIPE_CAP_TEXTURE_QUERY_LOD:
    case PIPE_CAP_SAMPLE_SHADING:
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
@@ -174,11 +175,16 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CLIP_HALFZ:
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_COMPUTE:
       return (class_3d == NVE4_3D_CLASS) ? 1 : 0;
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+      return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
 
    /* unsupported caps */
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
@@ -226,13 +232,14 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 
    switch (shader) {
    case PIPE_SHADER_VERTEX:
-      /*
-   case PIPE_SHADER_TESSELLATION_CONTROL:
-   case PIPE_SHADER_TESSELLATION_EVALUATION:
-      */
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_FRAGMENT:
       break;
+   case PIPE_SHADER_TESS_CTRL:
+   case PIPE_SHADER_TESS_EVAL:
+      if (class_3d >= GM107_3D_CLASS)
+         return 0;
+      break;
    case PIPE_SHADER_COMPUTE:
       if (class_3d != NVE4_3D_CLASS)
          return 0;
@@ -341,6 +348,7 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
                               enum pipe_compute_cap param, void *data)
 {
    uint64_t *data64 = (uint64_t *)data;
+   uint32_t *data32 = (uint32_t *)data;
    const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass;
 
    switch (param) {
@@ -372,6 +380,9 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
    case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
       data64[0] = 4096;
       return 8;
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      data32[0] = 32;
+      return 4;
    default:
       return 0;
    }
@@ -550,7 +561,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen)
       /* Using COMPUTE has weird effects on 3D state, we need to
        * investigate this further before enabling it by default.
        */
-      if (debug_get_bool_option("NVC0_COMPUTE", FALSE))
+      if (debug_get_bool_option("NVC0_COMPUTE", false))
          return nvc0_screen_compute_setup(screen, screen->base.pushbuf);
       return 0;
    case 0xe0:
@@ -564,7 +575,7 @@ nvc0_screen_init_compute(struct nvc0_screen *screen)
    }
 }
 
-boolean
+bool
 nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
                             uint32_t lpos, uint32_t lneg, uint32_t cstack)
 {
@@ -574,7 +585,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
 
    if (size >= (1 << 20)) {
       NOUVEAU_ERR("requested TLS size too large: 0x%"PRIx64"\n", size);
-      return FALSE;
+      return false;
    }
 
    size *= (screen->base.device->chipset >= 0xe0) ? 64 : 48; /* max warps */
@@ -587,11 +598,11 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
                         NULL, &bo);
    if (ret) {
       NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size);
-      return FALSE;
+      return false;
    }
    nouveau_bo_ref(NULL, &screen->tls);
    screen->tls = bo;
-   return TRUE;
+   return true;
 }
 
 #define FAIL_SCREEN_INIT(str, err)                    \
@@ -610,6 +621,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    struct nouveau_pushbuf *push;
    uint64_t value;
    uint32_t obj_class;
+   uint32_t flags;
    int ret;
    unsigned i;
 
@@ -665,8 +677,11 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.base.get_video_param = nouveau_vp3_screen_get_video_param;
    screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported;
 
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, 4096, NULL,
-                        &screen->fence.bo);
+   flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP;
+   if (dev->drm_version >= 0x01000202)
+      flags |= NOUVEAU_BO_COHERENT;
+
+   ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo);
    if (ret)
       goto fail;
    nouveau_bo_map(screen->fence.bo, 0, NULL);
@@ -781,7 +796,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    BEGIN_NVC0(push, NVC0_3D(COND_MODE), 1);
    PUSH_DATA (push, NVC0_3D_COND_MODE_ALWAYS);
 
-   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) {
+   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", true)) {
       /* kill shaders after about 1 second (at 100 MHz) */
       BEGIN_NVC0(push, NVC0_3D(WATCHDOG_TIMER), 1);
       PUSH_DATA (push, 0x17);
@@ -1012,6 +1027,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATA (push, 0x20);
    BEGIN_NVC0(push, NVC0_3D(SP_SELECT(0)), 1);
    PUSH_DATA (push, 0x00);
+   screen->save_state.patch_vertices = 3;
 
    BEGIN_NVC0(push, NVC0_3D(POINT_COORD_REPLACE), 1);
    PUSH_DATA (push, 0);
@@ -1031,7 +1047,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    if (!nvc0_blitter_create(screen))
       goto fail;
 
-   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return pscreen;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index ef2bd43f006..d8826ae0c0d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -28,16 +28,17 @@ struct nvc0_context;
 struct nvc0_blitter;
 
 struct nvc0_graph_state {
-   boolean flushed;
-   boolean rasterizer_discard;
-   boolean early_z_forced;
-   boolean prim_restart;
+   bool flushed;
+   bool rasterizer_discard;
+   bool early_z_forced;
+   bool prim_restart;
    uint32_t instance_elts; /* bitmask of per-instance elements */
    uint32_t instance_base;
    uint32_t constant_vbos;
    uint32_t constant_elts;
    int32_t index_bias;
    uint16_t scissor;
+   uint8_t patch_vertices;
    uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
    uint8_t num_vtxbufs;
    uint8_t num_vtxelts;
@@ -95,7 +96,7 @@ struct nvc0_screen {
       struct nvc0_program *prog; /* compute state object to read MP counters */
       struct pipe_query *mp_counter[8]; /* counter to query allocation */
       uint8_t num_mp_pm_active[2];
-      boolean mp_counters_enabled;
+      bool mp_counters_enabled;
    } pm;
 
    struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
@@ -105,7 +106,7 @@ struct nvc0_screen {
    struct nouveau_object *nvsw;
 };
 
-static INLINE struct nvc0_screen *
+static inline struct nvc0_screen *
 nvc0_screen(struct pipe_screen *screen)
 {
    return (struct nvc0_screen *)screen;
@@ -276,7 +277,7 @@ int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
 int nvc0_screen_get_driver_query_group_info(struct pipe_screen *, unsigned,
                                             struct pipe_driver_query_group_info *);
 
-boolean nvc0_blitter_create(struct nvc0_screen *);
+bool nvc0_blitter_create(struct nvc0_screen *);
 void nvc0_blitter_destroy(struct nvc0_screen *);
 
 void nvc0_screen_make_buffers_resident(struct nvc0_screen *);
@@ -287,10 +288,10 @@ int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *);
 int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
 int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
 
-boolean nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos,
-                                    uint32_t lneg, uint32_t cstack);
+bool nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos,
+                                 uint32_t lneg, uint32_t cstack);
 
-static INLINE void
+static inline void
 nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
    struct nvc0_screen *screen = nvc0_screen(res->base.screen);
@@ -302,7 +303,7 @@ nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_resource_validate(struct nv04_resource *res, uint32_t flags)
 {
    if (likely(res->bo)) {
@@ -325,21 +326,21 @@ struct nvc0_format {
 
 extern const struct nvc0_format nvc0_format_table[];
 
-static INLINE void
+static inline void
 nvc0_screen_tic_unlock(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0)
       screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
 }
 
-static INLINE void
+static inline void
 nvc0_screen_tsc_unlock(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0)
       screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
 }
 
-static INLINE void
+static inline void
 nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
 {
    if (tic->id >= 0) {
@@ -348,7 +349,7 @@ nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_screen_tsc_free(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
 {
    if (tsc->id >= 0) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index e0842784a88..8aa127adc0a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -27,7 +27,7 @@
 
 #include "nvc0/nvc0_context.h"
 
-static INLINE void
+static inline void
 nvc0_program_update_context_state(struct nvc0_context *nvc0,
                                   struct nvc0_program *prog, int stage)
 {
@@ -63,22 +63,22 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0,
    }
 }
 
-static INLINE boolean
+static inline bool
 nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
 {
    if (prog->mem)
-      return TRUE;
+      return true;
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
          prog, nvc0->screen->base.device->chipset);
       if (!prog->translated)
-         return FALSE;
+         return false;
    }
 
    if (likely(prog->code_size))
       return nvc0_program_upload_code(nvc0, prog);
-   return TRUE; /* stream output info only */
+   return true; /* stream output info only */
 }
 
 void
@@ -147,9 +147,6 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
       PUSH_DATA (push, tp->code_base);
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
       PUSH_DATA (push, tp->num_gprs);
-
-      if (tp->tp.input_patch_size <= 32)
-         IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), tp->tp.input_patch_size);
    } else {
       BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
       PUSH_DATA (push, 0x20);
@@ -192,7 +189,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
 
    /* we allow GPs with no code for specifying stream output state only */
    if (gp && gp->code_size) {
-      const boolean gp_selects_layer = !!(gp->hdr[13] & (1 << 9));
+      const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9));
 
       BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
       PUSH_DATA (push, 0x41);
@@ -280,7 +277,7 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
          nvc0_query_pushbuf_submit(push, targ->pq, 0x4);
       } else {
          PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */
-         targ->clean = FALSE;
+         targ->clean = false;
       }
    }
    for (; b < 4; ++b)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 6b7a211e71b..2a33857d9df 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -35,7 +35,7 @@
 
 #include "nouveau_gldefs.h"
 
-static INLINE uint32_t
+static inline uint32_t
 nvc0_colormask(unsigned mask)
 {
     uint32_t ret = 0;
@@ -55,7 +55,7 @@ nvc0_colormask(unsigned mask)
 #define NVC0_BLEND_FACTOR_CASE(a, b) \
    case PIPE_BLENDFACTOR_##a: return NV50_BLEND_FACTOR_##b
 
-static INLINE uint32_t
+static inline uint32_t
 nvc0_blend_fac(unsigned factor)
 {
    switch (factor) {
@@ -92,8 +92,8 @@ nvc0_blend_state_create(struct pipe_context *pipe,
    int r; /* reference */
    uint32_t ms;
    uint8_t blend_en = 0;
-   boolean indep_masks = FALSE;
-   boolean indep_funcs = FALSE;
+   bool indep_masks = false;
+   bool indep_funcs = false;
 
    so->pipe = *cso;
 
@@ -111,7 +111,7 @@ nvc0_blend_state_create(struct pipe_context *pipe,
              cso->rt[i].alpha_func != cso->rt[r].alpha_func ||
              cso->rt[i].alpha_src_factor != cso->rt[r].alpha_src_factor ||
              cso->rt[i].alpha_dst_factor != cso->rt[r].alpha_dst_factor) {
-            indep_funcs = TRUE;
+            indep_funcs = true;
             break;
          }
       }
@@ -120,7 +120,7 @@ nvc0_blend_state_create(struct pipe_context *pipe,
 
       for (i = 1; i < 8; ++i) {
          if (cso->rt[i].colormask != cso->rt[0].colormask) {
-            indep_masks = TRUE;
+            indep_masks = true;
             break;
          }
       }
@@ -351,6 +351,13 @@ nvc0_zsa_state_create(struct pipe_context *pipe,
       SB_DATA    (so, nvgl_comparison_op(cso->depth.func));
    }
 
+   SB_IMMED_3D(so, DEPTH_BOUNDS_EN, cso->depth.bounds_test);
+   if (cso->depth.bounds_test) {
+      SB_BEGIN_3D(so, DEPTH_BOUNDS(0), 2);
+      SB_DATA    (so, fui(cso->depth.bounds_min));
+      SB_DATA    (so, fui(cso->depth.bounds_max));
+   }
+
    if (cso->stencil[0].enabled) {
       SB_BEGIN_3D(so, STENCIL_ENABLE, 5);
       SB_DATA    (so, 1);
@@ -428,7 +435,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
    FREE(hwcso);
 }
 
-static INLINE void
+static inline void
 nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, int s,
                                unsigned nr, void **hwcso)
 {
@@ -508,6 +515,14 @@ nvc0_bind_sampler_states(struct pipe_context *pipe, unsigned shader,
       assert(start == 0);
       nvc0_stage_sampler_states_bind(nvc0_context(pipe), 0, nr, s);
       break;
+   case PIPE_SHADER_TESS_CTRL:
+      assert(start == 0);
+      nvc0_stage_sampler_states_bind(nvc0_context(pipe), 1, nr, s);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      assert(start == 0);
+      nvc0_stage_sampler_states_bind(nvc0_context(pipe), 2, nr, s);
+      break;
    case PIPE_SHADER_GEOMETRY:
       assert(start == 0);
       nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s);
@@ -537,7 +552,7 @@ nvc0_sampler_view_destroy(struct pipe_context *pipe,
    FREE(nv50_tic_entry(view));
 }
 
-static INLINE void
+static inline void
 nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s,
                              unsigned nr,
                              struct pipe_sampler_view **views)
@@ -633,6 +648,12 @@ nvc0_set_sampler_views(struct pipe_context *pipe, unsigned shader,
    case PIPE_SHADER_VERTEX:
       nvc0_stage_set_sampler_views(nvc0_context(pipe), 0, nr, views);
       break;
+   case PIPE_SHADER_TESS_CTRL:
+      nvc0_stage_set_sampler_views(nvc0_context(pipe), 1, nr, views);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      nvc0_stage_set_sampler_views(nvc0_context(pipe), 2, nr, views);
+      break;
    case PIPE_SHADER_GEOMETRY:
       nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views);
       break;
@@ -733,6 +754,38 @@ nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     nvc0->dirty |= NVC0_NEW_GMTYPROG;
 }
 
+static void *
+nvc0_tcp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{ /* Tessellation control CSOs come from the common nvc0_sp_state_create(). */
+   return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_CTRL);
+}
+
+/* Bind hwcso as the tessellation control program; flag it for validation. */
+static void
+nvc0_tcp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nvc0_context *ctx = nvc0_context(pipe);
+   ctx->tctlprog = hwcso;
+   ctx->dirty |= NVC0_NEW_TCTLPROG;
+}
+
+static void *
+nvc0_tep_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{ /* Tessellation evaluation CSOs come from the common nvc0_sp_state_create(). */
+   return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_TESS_EVAL);
+}
+
+/* Bind hwcso as the tessellation evaluation program; flag it for validation. */
+static void
+nvc0_tep_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nvc0_context *ctx = nvc0_context(pipe);
+   ctx->tevlprog = hwcso;
+   ctx->dirty |= NVC0_NEW_TEVLPROG;
+}
+
 static void *
 nvc0_cp_state_create(struct pipe_context *pipe,
                      const struct pipe_compute_state *cso)
@@ -790,7 +843,7 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
 
    pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, res);
 
-   nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
+   nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? true : false;
    if (nvc0->constbuf[s][i].user) {
       nvc0->constbuf[s][i].u.data = cb->user_buffer;
       nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
@@ -933,6 +986,18 @@ nvc0_set_viewport_states(struct pipe_context *pipe,
 
 }
 
+/* Latch the default outer/inner tess levels; they are emitted on validate. */
+static void
+nvc0_set_tess_state(struct pipe_context *pipe,
+                    const float default_tess_outer[4],
+                    const float default_tess_inner[2])
+{
+   struct nvc0_context *ctx = nvc0_context(pipe);
+   memcpy(ctx->default_tess_outer, default_tess_outer, sizeof(float) * 4);
+   memcpy(ctx->default_tess_inner, default_tess_inner, sizeof(float) * 2);
+   ctx->dirty |= NVC0_NEW_TESSFACTOR;
+}
+
 static void
 nvc0_set_vertex_buffers(struct pipe_context *pipe,
                         unsigned start_slot, unsigned count,
@@ -1018,7 +1083,7 @@ nvc0_so_target_create(struct pipe_context *pipe,
       FREE(targ);
       return NULL;
    }
-   targ->clean = TRUE;
+   targ->clean = true;
 
    targ->pipe.buffer_size = size;
    targ->pipe.buffer_offset = offset;
@@ -1051,13 +1116,13 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    unsigned i;
-   boolean serialize = TRUE;
+   bool serialize = true;
 
    assert(num_targets <= 4);
 
    for (i = 0; i < num_targets; ++i) {
-      const boolean changed = nvc0->tfbbuf[i] != targets[i];
-      const boolean append = (offsets[i] == ((unsigned)-1));
+      const bool changed = nvc0->tfbbuf[i] != targets[i];
+      const bool append = (offsets[i] == ((unsigned)-1));
       if (!changed && append)
          continue;
       nvc0->tfbbuf_dirty |= 1 << i;
@@ -1066,7 +1131,7 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
          nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize);
 
       if (targets[i] && !append)
-         nvc0_so_target(targets[i])->clean = TRUE;
+         nvc0_so_target(targets[i])->clean = true;
 
       pipe_so_target_reference(&nvc0->tfbbuf[i], targets[i]);
    }
@@ -1125,16 +1190,18 @@ nvc0_set_compute_resources(struct pipe_context *pipe,
 }
 
 static void
-nvc0_set_shader_resources(struct pipe_context *pipe,
-                          unsigned start, unsigned nr,
-                          struct pipe_surface **resources)
+nvc0_set_shader_images(struct pipe_context *pipe, unsigned shader,
+                       unsigned start_slot, unsigned count,
+                       struct pipe_image_view **views)
 {
-   nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, resources);
+#if 0 /* stub: body is compiled out; 'start'/'nr' below are the pre-rename names */
+   nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, views);
 
    nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES;
+#endif /* 0 */
 }
 
-static INLINE void
+static inline void
 nvc0_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
 {
    struct nv04_resource *buf = nv04_resource(res);
@@ -1218,12 +1285,18 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->create_vs_state = nvc0_vp_state_create;
    pipe->create_fs_state = nvc0_fp_state_create;
    pipe->create_gs_state = nvc0_gp_state_create;
+   pipe->create_tcs_state = nvc0_tcp_state_create;
+   pipe->create_tes_state = nvc0_tep_state_create;
    pipe->bind_vs_state = nvc0_vp_state_bind;
    pipe->bind_fs_state = nvc0_fp_state_bind;
    pipe->bind_gs_state = nvc0_gp_state_bind;
+   pipe->bind_tcs_state = nvc0_tcp_state_bind;
+   pipe->bind_tes_state = nvc0_tep_state_bind;
    pipe->delete_vs_state = nvc0_sp_state_delete;
    pipe->delete_fs_state = nvc0_sp_state_delete;
    pipe->delete_gs_state = nvc0_sp_state_delete;
+   pipe->delete_tcs_state = nvc0_sp_state_delete;
+   pipe->delete_tes_state = nvc0_sp_state_delete;
 
    pipe->create_compute_state = nvc0_cp_state_create;
    pipe->bind_compute_state = nvc0_cp_state_bind;
@@ -1239,6 +1312,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->set_polygon_stipple = nvc0_set_polygon_stipple;
    pipe->set_scissor_states = nvc0_set_scissor_states;
    pipe->set_viewport_states = nvc0_set_viewport_states;
+   pipe->set_tess_state = nvc0_set_tess_state;
 
    pipe->create_vertex_elements_state = nvc0_vertex_state_create;
    pipe->delete_vertex_elements_state = nvc0_vertex_state_delete;
@@ -1253,8 +1327,14 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
 
    pipe->set_global_binding = nvc0_set_global_bindings;
    pipe->set_compute_resources = nvc0_set_compute_resources;
-   pipe->set_shader_resources = nvc0_set_shader_resources;
+   pipe->set_shader_images = nvc0_set_shader_images;
 
    nvc0->sample_mask = ~0;
    nvc0->min_samples = 1;
+   nvc0->default_tess_outer[0] =
+   nvc0->default_tess_outer[1] =
+   nvc0->default_tess_outer[2] =
+   nvc0->default_tess_outer[3] = 1.0;
+   nvc0->default_tess_inner[0] =
+   nvc0->default_tess_inner[1] = 1.0;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index c52399ab312..ce1119c284d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -55,7 +55,7 @@ nvc0_validate_zcull(struct nvc0_context *nvc0)
 }
 #endif
 
-static INLINE void
+static inline void
 nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i)
 {
    BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 6);
@@ -74,7 +74,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
     struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
     unsigned i, ms;
     unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
-    boolean serialize = FALSE;
+    bool serialize = false;
 
     nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
 
@@ -136,7 +136,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
         }
 
         if (res->status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-           serialize = TRUE;
+           serialize = true;
         res->status |=  NOUVEAU_BUFFER_STATUS_GPU_WRITING;
         res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -168,7 +168,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
         ms_mode = mt->ms_mode;
 
         if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
-           serialize = TRUE;
+           serialize = true;
         mt->base.status |=  NOUVEAU_BUFFER_STATUS_GPU_WRITING;
         mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
@@ -309,7 +309,7 @@ nvc0_validate_viewport(struct nvc0_context *nvc0)
    nvc0->viewports_dirty = 0;
 }
 
-static INLINE void
+static inline void
 nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -324,7 +324,7 @@ nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
    PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
 }
 
-static INLINE void
+static inline void
 nvc0_check_program_ucps(struct nvc0_context *nvc0,
                         struct nvc0_program *vp, uint8_t mask)
 {
@@ -339,7 +339,7 @@ nvc0_check_program_ucps(struct nvc0_context *nvc0,
       nvc0_vertprog_validate(nvc0);
    else
    if (likely(vp == nvc0->gmtyprog))
-      nvc0_vertprog_validate(nvc0);
+      nvc0_gmtyprog_validate(nvc0);
    else
       nvc0_tevlprog_validate(nvc0);
 }
@@ -455,6 +455,8 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
                PUSH_DATA (push, (i << 4) | 1);
 
                BCTX_REFN(nvc0->bufctx_3d, CB(s, i), res, RD);
+
+               nvc0->cb_dirty = 1; /* Force cache flush for UBO. */
             } else {
                BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
                PUSH_DATA (push, (i << 4) | 0);
@@ -518,12 +520,12 @@ static void
 nvc0_validate_derived_1(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   boolean rasterizer_discard;
+   bool rasterizer_discard;
 
    if (nvc0->rast && nvc0->rast->pipe.rasterizer_discard) {
-      rasterizer_discard = TRUE;
+      rasterizer_discard = true;
    } else {
-      boolean zs = nvc0->zsa &&
+      bool zs = nvc0->zsa &&
          (nvc0->zsa->pipe.depth.enabled || nvc0->zsa->pipe.stencil[0].enabled);
       rasterizer_discard = !zs &&
          (!nvc0->fragprog || !nvc0->fragprog->hdr[18]);
@@ -535,6 +537,33 @@ nvc0_validate_derived_1(struct nvc0_context *nvc0)
    }
 }
 
+/* Alpha test is disabled when there are no color RTs, so bind a null RT 0
+ * whenever alpha test is enabled but the framebuffer has no color buffers.
+ * This has to run after nvc0_validate_fb, which would otherwise override
+ * the RT count set here. */
+static void
+nvc0_validate_derived_2(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *pb = nvc0->base.pushbuf;
+   if (!nvc0->zsa || !nvc0->zsa->pipe.alpha.enabled)
+      return;
+   if (nvc0->framebuffer.nr_cbufs != 0)
+      return;
+   nvc0_fb_set_null_rt(pb, 0);
+   BEGIN_NVC0(pb, NVC0_3D(RT_CONTROL), 1);
+   PUSH_DATA (pb, (076543210 << 4) | 1);
+}
+
+/* Push the 4 outer + 2 inner default tess levels from TESS_LEVEL_OUTER(0). */
+static void
+nvc0_validate_tess_state(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *pb = nvc0->base.pushbuf;
+   BEGIN_NVC0(pb, NVC0_3D(TESS_LEVEL_OUTER(0)), 6);
+   PUSH_DATAp(pb, nvc0->default_tess_outer, 4);
+   PUSH_DATAp(pb, nvc0->default_tess_inner, 2);
+}
+
 static void
 nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
 {
@@ -593,10 +622,12 @@ static struct state_validate {
     { nvc0_vertprog_validate,      NVC0_NEW_VERTPROG },
     { nvc0_tctlprog_validate,      NVC0_NEW_TCTLPROG },
     { nvc0_tevlprog_validate,      NVC0_NEW_TEVLPROG },
+    { nvc0_validate_tess_state,    NVC0_NEW_TESSFACTOR },
     { nvc0_gmtyprog_validate,      NVC0_NEW_GMTYPROG },
     { nvc0_fragprog_validate,      NVC0_NEW_FRAGPROG },
     { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
                                    NVC0_NEW_RASTERIZER },
+    { nvc0_validate_derived_2,     NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER },
     { nvc0_validate_clip,          NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER |
                                    NVC0_NEW_VERTPROG |
                                    NVC0_NEW_TEVLPROG |
@@ -613,7 +644,7 @@ static struct state_validate {
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
 
-boolean
+bool
 nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words)
 {
    uint32_t state_mask;
@@ -634,15 +665,15 @@ nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words)
       }
       nvc0->dirty &= ~state_mask;
 
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, FALSE);
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, false);
    }
 
    nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d);
    ret = nouveau_pushbuf_validate(nvc0->base.pushbuf);
 
    if (unlikely(nvc0->state.flushed)) {
-      nvc0->state.flushed = FALSE;
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, TRUE);
+      nvc0->state.flushed = false;
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, true);
    }
    return !ret;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 1d70b7c7b23..18fcc12dea3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -29,7 +29,7 @@ struct nvc0_rasterizer_stateobj {
 struct nvc0_zsa_stateobj {
    struct pipe_depth_stencil_alpha_state pipe;
    int size;
-   uint32_t state[26];
+   uint32_t state[30];
 };
 
 struct nvc0_constbuf {
@@ -39,7 +39,7 @@ struct nvc0_constbuf {
    } u;
    uint32_t size;
    uint32_t offset;
-   boolean user; /* should only be TRUE if u.data is valid and non-NULL */
+   bool user; /* should only be true if u.data is valid and non-NULL */
 };
 
 struct nvc0_vertex_element {
@@ -55,8 +55,8 @@ struct nvc0_vertex_stateobj {
    unsigned num_elements;
    uint32_t instance_elts;
    uint32_t instance_bufs;
-   boolean shared_slots;
-   boolean need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */
+   bool shared_slots;
+   bool need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */
    unsigned size; /* size of vertex in bytes (when packed) */
    struct nvc0_vertex_element element[0];
 };
@@ -65,10 +65,10 @@ struct nvc0_so_target {
    struct pipe_stream_output_target pipe;
    struct pipe_query *pq;
    unsigned stride;
-   boolean clean;
+   bool clean;
 };
 
-static INLINE struct nvc0_so_target *
+static inline struct nvc0_so_target *
 nvc0_so_target(struct pipe_stream_output_target *ptarg)
 {
    return (struct nvc0_so_target *)ptarg;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index a820de7259a..51a6f93f891 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -47,8 +47,8 @@
 #define NOUVEAU_DRIVER 0xc0
 #include "nv50/nv50_blit.h"
 
-static INLINE uint8_t
-nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
+static inline uint8_t
+nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
 {
    uint8_t id = nvc0_format_table[format].rt;
 
@@ -81,9 +81,9 @@ nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
 }
 
 static int
-nvc0_2d_texture_set(struct nouveau_pushbuf *push, boolean dst,
+nvc0_2d_texture_set(struct nouveau_pushbuf *push, bool dst,
                     struct nv50_miptree *mt, unsigned level, unsigned layer,
-                    enum pipe_format pformat, boolean dst_src_pformat_equal)
+                    enum pipe_format pformat, bool dst_src_pformat_equal)
 {
    struct nouveau_bo *bo = mt->base.bo;
    uint32_t width, height, depth;
@@ -161,16 +161,16 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push,
    const enum pipe_format dfmt = dst->base.base.format;
    const enum pipe_format sfmt = src->base.base.format;
    int ret;
-   boolean eqfmt = dfmt == sfmt;
+   bool eqfmt = dfmt == sfmt;
 
    if (!PUSH_SPACE(push, 2 * 16 + 32))
       return PIPE_ERROR;
 
-   ret = nvc0_2d_texture_set(push, TRUE, dst, dst_level, dz, dfmt, eqfmt);
+   ret = nvc0_2d_texture_set(push, true, dst, dst_level, dz, dfmt, eqfmt);
    if (ret)
       return ret;
 
-   ret = nvc0_2d_texture_set(push, FALSE, src, src_level, sz, sfmt, eqfmt);
+   ret = nvc0_2d_texture_set(push, false, src, src_level, sz, sfmt, eqfmt);
    if (ret)
       return ret;
 
@@ -189,7 +189,7 @@ nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push,
    PUSH_DATA (push, 0);
    PUSH_DATA (push, sx << src->ms_x);
    PUSH_DATA (push, 0);
-   PUSH_DATA (push, sy << src->ms_x);
+   PUSH_DATA (push, sy << src->ms_y);
 
    return 0;
 }
@@ -203,7 +203,7 @@ nvc0_resource_copy_region(struct pipe_context *pipe,
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    int ret;
-   boolean m2mf;
+   bool m2mf;
    unsigned dst_layer = dstz, src_layer = src_box->z;
 
    if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
@@ -704,7 +704,7 @@ nvc0_blitter_make_vp(struct nvc0_blitter *blit)
    };
 
    blit->vp.type = PIPE_SHADER_VERTEX;
-   blit->vp.translated = TRUE;
+   blit->vp.translated = true;
    if (blit->screen->base.class_3d >= GM107_3D_CLASS) {
       blit->vp.code = (uint32_t *)code_gm107; /* const_cast */
       blit->vp.code_size = sizeof(code_gm107);
@@ -1217,7 +1217,7 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
    int i;
    uint32_t mode;
    uint32_t mask = nv50_blit_eng2d_get_mask(info);
-   boolean b;
+   bool b;
 
    mode = nv50_blit_get_filter(info) ?
       NV50_2D_BLIT_CONTROL_FILTER_BILINEAR :
@@ -1376,39 +1376,40 @@ static void
 nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
-   boolean eng3d = FALSE;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   bool eng3d = false;
 
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
       if (!(info->mask & PIPE_MASK_ZS))
          return;
       if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT ||
           info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
-         eng3d = TRUE;
+         eng3d = true;
       if (info->filter != PIPE_TEX_FILTER_NEAREST)
-         eng3d = TRUE;
+         eng3d = true;
    } else {
       if (!(info->mask & PIPE_MASK_RGBA))
          return;
       if (info->mask != PIPE_MASK_RGBA)
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (nv50_miptree(info->src.resource)->layout_3d) {
-      eng3d = TRUE;
+      eng3d = true;
    } else
    if (info->src.box.depth != info->dst.box.depth) {
-      eng3d = TRUE;
+      eng3d = true;
       debug_printf("blit: cannot filter array or cube textures in z direction");
    }
 
    if (!eng3d && info->dst.format != info->src.format) {
       if (!nv50_2d_dst_format_faithful(info->dst.format)) {
-         eng3d = TRUE;
+         eng3d = true;
       } else
       if (!nv50_2d_src_format_faithful(info->src.format)) {
          if (!util_format_is_luminance(info->src.format)) {
             if (!nv50_2d_dst_format_ops_supported(info->dst.format))
-               eng3d = TRUE;
+               eng3d = true;
             else
             if (util_format_is_intensity(info->src.format))
                eng3d = info->src.format != PIPE_FORMAT_I8_UNORM;
@@ -1420,30 +1421,36 @@ nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
          }
       } else
       if (util_format_is_luminance_alpha(info->src.format))
-         eng3d = TRUE;
+         eng3d = true;
    }
 
    if (info->src.resource->nr_samples == 8 &&
        info->dst.resource->nr_samples <= 1)
-      eng3d = TRUE;
+      eng3d = true;
 #if 0
    /* FIXME: can't make this work with eng2d anymore, at least not on nv50 */
    if (info->src.resource->nr_samples > 1 ||
        info->dst.resource->nr_samples > 1)
-      eng3d = TRUE;
+      eng3d = true;
 #endif
    /* FIXME: find correct src coordinates adjustments */
    if ((info->src.box.width !=  info->dst.box.width &&
         info->src.box.width != -info->dst.box.width) ||
        (info->src.box.height !=  info->dst.box.height &&
         info->src.box.height != -info->dst.box.height))
-      eng3d = TRUE;
+      eng3d = true;
+
+   if (nvc0->screen->num_occlusion_queries_active)
+      IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
 
    if (!eng3d)
       nvc0_blit_eng2d(nvc0, info);
    else
       nvc0_blit_3d(nvc0, info);
 
+   if (nvc0->screen->num_occlusion_queries_active)
+      IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
+
    NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_blit_count, 1);
 }
 
@@ -1453,13 +1460,13 @@ nvc0_flush_resource(struct pipe_context *ctx,
 {
 }
 
-boolean
+bool
 nvc0_blitter_create(struct nvc0_screen *screen)
 {
    screen->blitter = CALLOC_STRUCT(nvc0_blitter);
    if (!screen->blitter) {
       NOUVEAU_ERR("failed to allocate blitter struct\n");
-      return FALSE;
+      return false;
    }
    screen->blitter->screen = screen;
 
@@ -1468,7 +1475,7 @@ nvc0_blitter_create(struct nvc0_screen *screen)
    nvc0_blitter_make_vp(screen->blitter);
    nvc0_blitter_make_sampler(screen->blitter);
 
-   return TRUE;
+   return true;
 }
 
 void
@@ -1491,20 +1498,20 @@ nvc0_blitter_destroy(struct nvc0_screen *screen)
    FREE(blitter);
 }
 
-boolean
+bool
 nvc0_blitctx_create(struct nvc0_context *nvc0)
 {
    nvc0->blit = CALLOC_STRUCT(nvc0_blitctx);
    if (!nvc0->blit) {
       NOUVEAU_ERR("failed to allocate blit context\n");
-      return FALSE;
+      return false; /* allocation failed; nvc0->blit is left NULL */
    }
 
    nvc0->blit->nvc0 = nvc0;
 
    nvc0->blit->rast.pipe.half_pixel_center = 1;
 
-   return TRUE;
+   return true; /* blit context allocated and linked back to nvc0 */
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index ddc0409ca86..d19082e0e15 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -34,8 +34,8 @@
    (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK |   \
     NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
 
-static INLINE uint32_t
-nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int)
+static inline uint32_t
+nv50_tic_swizzle(uint32_t tc, unsigned swz, bool tex_int)
 {
    switch (swz) {
    case PIPE_SWIZZLE_RED:
@@ -82,7 +82,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
    uint32_t depth;
    struct nv50_tic_entry *view;
    struct nv50_miptree *mt;
-   boolean tex_int;
+   bool tex_int;
 
    view = MALLOC_STRUCT(nv50_tic_entry);
    if (!view)
@@ -195,7 +195,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
    default:
       NOUVEAU_ERR("unexpected/invalid texture target: %d\n",
                   mt->base.base.target);
-      return FALSE;
+      return false;
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
@@ -226,7 +226,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
    return &view->pipe;
 }
 
-static boolean
+static bool
 nvc0_validate_tic(struct nvc0_context *nvc0, int s)
 {
    uint32_t commands[32];
@@ -234,12 +234,12 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s)
    struct nouveau_bo *txc = nvc0->screen->txc;
    unsigned i;
    unsigned n = 0;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
       struct nv04_resource *res;
-      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 
       if (!tic) {
          if (dirty)
@@ -263,7 +263,7 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s)
          BEGIN_NIC0(push, NVC0_M2MF(DATA), 8);
          PUSH_DATAp(push, &tic->tic[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       } else
       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -295,18 +295,18 @@ nvc0_validate_tic(struct nvc0_context *nvc0, int s)
    return need_flush;
 }
 
-static boolean
+static bool
 nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
 {
    struct nouveau_bo *txc = nvc0->screen->txc;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
       struct nv04_resource *res;
-      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 
       if (!tic) {
          nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
@@ -328,7 +328,7 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
          PUSH_DATA (push, 0x1001);
          PUSH_DATAp(push, &tic->tic[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       } else
       if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
          BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
@@ -356,16 +356,14 @@ nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
 
 void nvc0_validate_textures(struct nvc0_context *nvc0)
 {
-   boolean need_flush;
+   bool need_flush = false;
+   int i;
 
-   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) {
-      need_flush  = nve4_validate_tic(nvc0, 0);
-      need_flush |= nve4_validate_tic(nvc0, 3);
-      need_flush |= nve4_validate_tic(nvc0, 4);
-   } else {
-      need_flush  = nvc0_validate_tic(nvc0, 0);
-      need_flush |= nvc0_validate_tic(nvc0, 3);
-      need_flush |= nvc0_validate_tic(nvc0, 4);
+   for (i = 0; i < 5; i++) {
+      if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+         need_flush |= nve4_validate_tic(nvc0, i);
+      else
+         need_flush |= nvc0_validate_tic(nvc0, i);
    }
 
    if (need_flush) {
@@ -374,14 +372,14 @@ void nvc0_validate_textures(struct nvc0_context *nvc0)
    }
 }
 
-static boolean
+static bool
 nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
 {
    uint32_t commands[16];
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned i;
    unsigned n = 0;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_samplers[s]; ++i) {
       struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]);
@@ -398,7 +396,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
          nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc,
                                65536 + tsc->id * 32, NV_VRAM_DOMAIN(&nvc0->screen->base),
                                32, tsc->tsc);
-         need_flush = TRUE;
+         need_flush = true;
       }
       nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
 
@@ -418,13 +416,13 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
    return need_flush;
 }
 
-boolean
+bool
 nve4_validate_tsc(struct nvc0_context *nvc0, int s)
 {
    struct nouveau_bo *txc = nvc0->screen->txc;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned i;
-   boolean need_flush = FALSE;
+   bool need_flush = false;
 
    for (i = 0; i < nvc0->num_samplers[s]; ++i) {
       struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]);
@@ -447,7 +445,7 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s)
          PUSH_DATA (push, 0x1001);
          PUSH_DATAp(push, &tsc->tsc[0], 8);
 
-         need_flush = TRUE;
+         need_flush = true;
       }
       nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
 
@@ -466,16 +464,14 @@ nve4_validate_tsc(struct nvc0_context *nvc0, int s)
 
 void nvc0_validate_samplers(struct nvc0_context *nvc0)
 {
-   boolean need_flush;
+   bool need_flush = false;
+   int i;
 
-   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) {
-      need_flush  = nve4_validate_tsc(nvc0, 0);
-      need_flush |= nve4_validate_tsc(nvc0, 3);
-      need_flush |= nve4_validate_tsc(nvc0, 4);
-   } else {
-      need_flush  = nvc0_validate_tsc(nvc0, 0);
-      need_flush |= nvc0_validate_tsc(nvc0, 3);
-      need_flush |= nvc0_validate_tsc(nvc0, 4);
+   for (i = 0; i < 5; i++) {
+      if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+         need_flush |= nve4_validate_tsc(nvc0, i);
+      else
+         need_flush |= nvc0_validate_tsc(nvc0, i);
    }
 
    if (need_flush) {
@@ -645,13 +641,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push,
    }
 }
 
-static INLINE void
+static inline void
 nvc0_update_surface_bindings(struct nvc0_context *nvc0)
 {
    /* TODO */
 }
 
-static INLINE void
+static inline void
 nve4_update_surface_bindings(struct nvc0_context *nvc0)
 {
    /* TODO */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index 45c6f7cc3ca..7cc5b4b1f48 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -329,17 +329,17 @@ nve4_m2mf_copy_linear(struct nouveau_context *nv,
 }
 
 
-static INLINE boolean
+static inline bool
 nvc0_mt_transfer_can_map_directly(struct nv50_miptree *mt)
 {
    if (mt->base.domain == NOUVEAU_BO_VRAM)
-      return FALSE;
+      return false; /* miptree is placed in VRAM */
    if (mt->base.base.usage != PIPE_USAGE_STAGING)
-      return FALSE;
+      return false; /* only PIPE_USAGE_STAGING resources qualify */
    return !nouveau_bo_memtype(mt->base.bo);
 }
 
-static INLINE boolean
+static inline bool
 nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
 {
    if (!mt->base.mm) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 8cf2584b0ce..6f9e7906713 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -61,8 +61,8 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
     so->num_elements = num_elements;
     so->instance_elts = 0;
     so->instance_bufs = 0;
-    so->shared_slots = FALSE;
-    so->need_conversion = FALSE;
+    so->shared_slots = false;
+    so->need_conversion = false;
 
     memset(so->vb_access_size, 0, sizeof(so->vb_access_size));
 
@@ -93,7 +93,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
                 return NULL;
             }
             so->element[i].state = nvc0_format_table[fmt].vtx;
-            so->need_conversion = TRUE;
+            so->need_conversion = true;
         }
         size = util_format_get_blocksize(fmt);
 
@@ -141,7 +141,7 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
 
     if (so->instance_elts || src_offset_max >= (1 << 14))
        return so;
-    so->shared_slots = TRUE;
+    so->shared_slots = true;
 
     for (i = 0; i < num_elements; ++i) {
        const unsigned b = elements[i].vertex_buffer_index;
@@ -196,7 +196,7 @@ nvc0_set_constant_vertex_attrib(struct nvc0_context *nvc0, const unsigned a)
    push->cur += 5;
 }
 
-static INLINE void
+static inline void
 nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi,
                      uint32_t *base, uint32_t *size)
 {
@@ -214,7 +214,7 @@ nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi,
    }
 }
 
-static INLINE void
+static inline void
 nvc0_release_user_vbufs(struct nvc0_context *nvc0)
 {
    if (nvc0->vbo_user) {
@@ -265,7 +265,7 @@ nvc0_update_user_vbufs(struct nvc0_context *nvc0)
       PUSH_DATAh(push, address[b] + ve->src_offset);
       PUSH_DATA (push, address[b] + ve->src_offset);
    }
-   nvc0->base.vbo_dirty = TRUE;
+   nvc0->base.vbo_dirty = true;
 }
 
 static void
@@ -419,7 +419,7 @@ nvc0_vertex_arrays_validate(struct nvc0_context *nvc0)
    uint32_t const_vbos;
    unsigned i;
    uint8_t vbo_mode;
-   boolean update_vertex;
+   bool update_vertex;
 
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
 
@@ -529,7 +529,7 @@ nvc0_idxbuf_validate(struct nvc0_context *nvc0)
 #define NVC0_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nvc0_prim_gl(unsigned prim)
 {
    switch (prim) {
@@ -547,8 +547,7 @@ nvc0_prim_gl(unsigned prim)
    NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
-   /*
-   NVC0_PRIM_GL_CASE(PATCHES); */
+   NVC0_PRIM_GL_CASE(PATCHES);
    default:
       return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
    }
@@ -559,7 +558,7 @@ nvc0_draw_vbo_kick_notify(struct nouveau_pushbuf *push)
 {
    struct nvc0_screen *screen = push->user_priv;
 
-   nouveau_fence_update(&screen->base, TRUE);
+   nouveau_fence_update(&screen->base, true);
 
    NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1);
 }
@@ -695,7 +694,7 @@ nvc0_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
 }
 
 static void
-nvc0_draw_elements(struct nvc0_context *nvc0, boolean shorten,
+nvc0_draw_elements(struct nvc0_context *nvc0, bool shorten,
                    unsigned mode, unsigned start, unsigned count,
                    unsigned instance_count, int32_t index_bias)
 {
@@ -835,8 +834,8 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
                         buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
 }
 
-static INLINE void
-nvc0_update_prim_restart(struct nvc0_context *nvc0, boolean en, uint32_t index)
+static inline void
+nvc0_update_prim_restart(struct nvc0_context *nvc0, bool en, uint32_t index)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 
@@ -889,6 +888,12 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       }
    }
 
+   if (info->mode == PIPE_PRIM_PATCHES &&
+       nvc0->state.patch_vertices != info->vertices_per_patch) {
+      nvc0->state.patch_vertices = info->vertices_per_patch;
+      IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices);
+   }
+
    /* 8 as minimum to avoid immediate double validation of new buffers */
    nvc0_state_validate(nvc0, ~0, 8);
 
@@ -910,13 +915,13 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
             continue;
 
          if (res->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-            nvc0->cb_dirty = TRUE;
+            nvc0->cb_dirty = true;
       }
    }
 
    if (nvc0->cb_dirty) {
       IMMED_NVC0(push, NVC0_3D(MEM_BARRIER), 0x1011);
-      nvc0->cb_dirty = FALSE;
+      nvc0->cb_dirty = false;
    }
 
    if (nvc0->state.vbo_mode) {
@@ -940,19 +945,19 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (!nvc0->vtxbuf[i].buffer)
          continue;
       if (nvc0->vtxbuf[i].buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-         nvc0->base.vbo_dirty = TRUE;
+         nvc0->base.vbo_dirty = true;
    }
 
    if (!nvc0->base.vbo_dirty && nvc0->idxbuf.buffer &&
        nvc0->idxbuf.buffer->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT)
-      nvc0->base.vbo_dirty = TRUE;
+      nvc0->base.vbo_dirty = true;
 
    nvc0_update_prim_restart(nvc0, info->primitive_restart, info->restart_index);
 
    if (nvc0->base.vbo_dirty) {
       if (nvc0->screen->eng3d->oclass < GM107_3D_CLASS)
          IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
-      nvc0->base.vbo_dirty = FALSE;
+      nvc0->base.vbo_dirty = false;
    }
 
    if (unlikely(info->indirect)) {
@@ -962,10 +967,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nvc0_draw_stream_output(nvc0, info);
    } else
    if (info->indexed) {
-      boolean shorten = info->max_index <= 65535;
+      bool shorten = info->max_index <= 65535;
 
       if (info->primitive_restart && info->restart_index > 65535)
-         shorten = FALSE;
+         shorten = false;
 
       nvc0_draw_elements(nvc0, shorten,
                          info->mode, info->start, info->count,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
index f180087161d..8b23a4887da 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -21,12 +21,12 @@ struct push_context {
    uint32_t restart_index;
    uint32_t instance_id;
 
-   boolean prim_restart;
-   boolean need_vertex_id;
+   bool prim_restart;
+   bool need_vertex_id;
 
    struct {
-      boolean enabled;
-      boolean value;
+      bool enabled;
+      bool value;
       unsigned stride;
       const uint8_t *data;
    } edgeflag;
@@ -47,7 +47,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
    ctx->need_vertex_id =
       nvc0->vertprog->vp.need_vertex_id && (nvc0->vertex->num_elements < 32);
 
-   ctx->edgeflag.value = TRUE;
+   ctx->edgeflag.value = true;
    ctx->edgeflag.enabled = nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS;
 
    /* silence warnings */
@@ -55,7 +55,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
    ctx->edgeflag.stride = 0;
 }
 
-static INLINE void
+static inline void
 nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias)
 {
    struct translate *translate = nvc0->vertex->translate;
@@ -78,7 +78,7 @@ nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0)
 {
    if (nvc0->idxbuf.buffer) {
@@ -90,7 +90,7 @@ nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0)
    }
 }
 
-static INLINE void
+static inline void
 nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
                        int32_t index_bias)
 {
@@ -112,7 +112,7 @@ nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
       ctx->edgeflag.data += (intptr_t)index_bias * vb->stride;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
 {
    unsigned i;
@@ -120,7 +120,7 @@ prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
 {
    unsigned i;
@@ -128,7 +128,7 @@ prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
 {
    unsigned i;
@@ -136,21 +136,21 @@ prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
    return i;
 }
 
-static INLINE boolean
+static inline bool
 ef_value(const struct push_context *ctx, uint32_t index)
 {
    float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
-   return *pf ? TRUE : FALSE;
+   return *pf ? true : false;
 }
 
-static INLINE boolean
+static inline bool
 ef_toggle(struct push_context *ctx)
 {
    ctx->edgeflag.value = !ctx->edgeflag.value;
    return ctx->edgeflag.value;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
 {
    unsigned i;
@@ -158,7 +158,7 @@ ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
 {
    unsigned i;
@@ -166,7 +166,7 @@ ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
 {
    unsigned i;
@@ -174,7 +174,7 @@ ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
    return i;
 }
 
-static INLINE unsigned
+static inline unsigned
 ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
 {
    unsigned i;
@@ -182,7 +182,7 @@ ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
    return i;
 }
 
-static INLINE void *
+static inline void *
 nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -409,7 +409,7 @@ disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 #define NVC0_PRIM_GL_CASE(n) \
    case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
 
-static INLINE unsigned
+static inline unsigned
 nvc0_prim_gl(unsigned prim)
 {
    switch (prim) {
@@ -427,8 +427,7 @@ nvc0_prim_gl(unsigned prim)
    NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
    NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
-   /*
-   NVC0_PRIM_GL_CASE(PATCHES); */
+   NVC0_PRIM_GL_CASE(PATCHES);
    default:
       return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
    }
@@ -483,7 +482,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
          struct pipe_context *pipe = &nvc0->base.pipe;
          struct nvc0_so_target *targ;
          targ = nvc0_so_target(info->count_from_stream_output);
-         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         pipe->get_query_result(pipe, targ->pq, true, (void *)&vert_count);
          vert_count /= targ->stride;
       }
       ctx.idxbuf = NULL; /* shut up warnings */
@@ -560,7 +559,7 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_fallback_count, 1);
 }
 
-static INLINE void
+static inline void
 copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n)
 {
    unsigned i;
@@ -568,7 +567,7 @@ copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n)
       dst[i] = elts[i] + bias;
 }
 
-static INLINE void
+static inline void
 copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n)
 {
    unsigned i;
@@ -576,7 +575,7 @@ copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n)
       dst[i] = elts[i] + bias;
 }
 
-static INLINE void
+static inline void
 copy_indices_u32(uint32_t *dst, const uint32_t *elts, uint32_t bias, unsigned n)
 {
    unsigned i;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
index 725e889683f..4ea8ca3cfa2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
@@ -15,14 +15,14 @@
 #endif
 
 
-static INLINE void
+static inline void
 nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin,
                             unsigned flags, struct nouveau_bo *bo)
 {
    nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL;
 }
 
-static INLINE void
+static inline void
 nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin,
                   struct nv04_resource *res, unsigned flags)
 {
@@ -38,7 +38,7 @@ nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin,
 #define BCTX_REFN(bctx, bin, res, acc) \
    nvc0_add_resident(bctx, NVC0_BIND_##bin, res, NOUVEAU_BO_##acc)
 
-static INLINE void
+static inline void
 PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 {
    struct nouveau_pushbuf_refn ref = { bo, flags };
@@ -69,46 +69,46 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 
 #define NVC0_3D_SERIALIZE NV50_GRAPH_SERIALIZE
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
 {
    return 0x20000000 | (size << 16) | (subc << 13) | (mthd >> 2);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
 {
    return 0x60000000 | (size << 16) | (subc << 13) | (mthd >> 2);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_IL(int subc, int mthd, uint16_t data)
 {
    assert(data < 0x2000);
    return 0x80000000 | (data << 16) | (subc << 13) | (mthd >> 2);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 NVC0_FIFO_PKHDR_1I(int subc, int mthd, unsigned size)
 {
    return 0xa0000000 | (size << 16) | (subc << 13) | (mthd >> 2);
 }
 
 
-static INLINE uint8_t
+static inline uint8_t
 nouveau_bo_memtype(const struct nouveau_bo *bo)
 {
    return bo->config.nvc0.memtype;
 }
 
 
-static INLINE void
+static inline void
 PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data)
 {
    *push->cur++ = (uint32_t)(data >> 32);
 }
 
-static INLINE void
+static inline void
 BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
@@ -117,7 +117,7 @@ BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
@@ -126,7 +126,7 @@ BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NVC0_FIFO_PKHDR_NI(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 BEGIN_1IC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
@@ -135,7 +135,7 @@ BEGIN_1IC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
    PUSH_DATA (push, NVC0_FIFO_PKHDR_1I(subc, mthd, size));
 }
 
-static INLINE void
+static inline void
 IMMED_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, uint16_t data)
 {
 #ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index fce02a7cc57..d3e5676873e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -250,7 +250,7 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
 static void
 nve4_compute_validate_samplers(struct nvc0_context *nvc0)
 {
-   boolean need_flush = nve4_validate_tsc(nvc0, 5);
+   bool need_flush = nve4_validate_tsc(nvc0, 5);
    if (need_flush) {
       BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1);
       PUSH_DATA (nvc0->base.pushbuf, 0);
@@ -299,11 +299,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 }
 
 
-static boolean
+static bool
 nve4_compute_state_validate(struct nvc0_context *nvc0)
 {
    if (!nvc0_compute_validate_program(nvc0))
-      return FALSE;
+      return false;
    if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES)
       nve4_compute_validate_textures(nvc0);
    if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS)
@@ -316,15 +316,15 @@ nve4_compute_state_validate(struct nvc0_context *nvc0)
       nvc0_validate_global_residents(nvc0,
                                      nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL);
 
-   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
+   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false);
 
    nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
    if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
-      return FALSE;
+      return false;
    if (unlikely(nvc0->state.flushed))
-      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
 
-   return TRUE;
+   return true;
 }
 
 
@@ -364,7 +364,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
 }
 
-static INLINE uint8_t
+static inline uint8_t
 nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 {
    if (shared_size > (32 << 10))
@@ -413,7 +413,7 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
 }
 
-static INLINE struct nve4_cp_launch_desc *
+static inline struct nve4_cp_launch_desc *
 nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                                struct nouveau_bo **pbo, uint64_t *pgpuaddr)
 {
@@ -505,7 +505,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
       struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
       struct nv04_resource *res;
-      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));
 
       if (!tic) {
          nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
@@ -575,18 +575,18 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
 {
    const uint32_t *data = (const uint32_t *)desc;
    unsigned i;
-   boolean zero = FALSE;
+   bool zero = false;
 
    debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
 
    for (i = 0; i < sizeof(*desc); i += 4) {
       if (data[i / 4]) {
          debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
-         zero = FALSE;
+         zero = false;
       } else
       if (!zero) {
          debug_printf("...\n");
-         zero = TRUE;
+         zero = true;
       }
    }
 
@@ -606,7 +606,7 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
    for (i = 0; i < 8; ++i) {
       uint64_t address;
       uint32_t size = desc->cb[i].size;
-      boolean valid = !!(desc->cb_mask & (1 << i));
+      bool valid = !!(desc->cb_mask & (1 << i));
 
       address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index 4d7af54d860..7364a68a579 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -68,7 +68,7 @@ struct nve4_cp_launch_desc
    u32 unk48[16];
 };
 
-static INLINE void
+static inline void
 nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
 {
    memset(desc, 0, sizeof(*desc));
@@ -78,7 +78,7 @@ nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
    desc->unk47_20 = 0x300;
 }
 
-static INLINE void
+static inline void
 nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
                            unsigned index,
                            struct nouveau_bo *bo,
@@ -96,7 +96,7 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
    desc->cb_mask |= 1 << index;
 }
 
-static INLINE void
+static inline void
 nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
                                unsigned index,
                                const struct nvc0_constbuf *cb)
diff --git a/src/gallium/drivers/r300/Makefile.am b/src/gallium/drivers/r300/Makefile.am
index dd1a5ede19b..081f332683e 100644
--- a/src/gallium/drivers/r300/Makefile.am
+++ b/src/gallium/drivers/r300/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index baf05cea965..6ea8f24cc14 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -382,7 +382,7 @@ static void r300_clear(struct pipe_context* pipe,
             r300_get_num_cs_end_dwords(r300);
 
         /* Reserve CS space. */
-        if (dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) {
+        if (dwords > (r300->cs->max_dw - r300->cs->cdw)) {
             r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL);
         }
 
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index c35aa3b24aa..8c24ad6d98a 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -94,6 +94,8 @@ static void r300_destroy_context(struct pipe_context* context)
 
     if (r300->cs)
         r300->rws->cs_destroy(r300->cs);
+    if (r300->ctx)
+        r300->rws->ctx_destroy(r300->ctx);
 
     rc_destroy_regalloc_state(&r300->fs_regalloc_state);
 
@@ -382,7 +384,11 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
                      sizeof(struct pipe_transfer), 64,
                      UTIL_SLAB_SINGLETHREADED);
 
-    r300->cs = rws->cs_create(rws, RING_GFX, r300_flush_callback, r300, NULL);
+    r300->ctx = rws->ctx_create(rws);
+    if (!r300->ctx)
+        goto fail;
+
+    r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300, NULL);
     if (r300->cs == NULL)
         goto fail;
 
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 3873c9a31c1..18ae11a3a24 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -449,6 +449,8 @@ struct r300_context {
 
     /* The interface to the windowing system, etc. */
     struct radeon_winsys *rws;
+    /* The submission context. */
+    struct radeon_winsys_ctx *ctx;
     /* The command stream. */
     struct radeon_winsys_cs *cs;
     /* Screen. */
@@ -647,32 +649,32 @@ struct r300_context {
     for (atom = r300->first_dirty; atom != r300->last_dirty; atom++)
 
 /* Convenience cast wrappers. */
-static INLINE struct r300_query* r300_query(struct pipe_query* q)
+static inline struct r300_query* r300_query(struct pipe_query* q)
 {
     return (struct r300_query*)q;
 }
 
-static INLINE struct r300_surface* r300_surface(struct pipe_surface* surf)
+static inline struct r300_surface* r300_surface(struct pipe_surface* surf)
 {
     return (struct r300_surface*)surf;
 }
 
-static INLINE struct r300_resource* r300_resource(struct pipe_resource* tex)
+static inline struct r300_resource* r300_resource(struct pipe_resource* tex)
 {
     return (struct r300_resource*)tex;
 }
 
-static INLINE struct r300_context* r300_context(struct pipe_context* context)
+static inline struct r300_context* r300_context(struct pipe_context* context)
 {
     return (struct r300_context*)context;
 }
 
-static INLINE struct r300_fragment_shader *r300_fs(struct r300_context *r300)
+static inline struct r300_fragment_shader *r300_fs(struct r300_context *r300)
 {
     return (struct r300_fragment_shader*)r300->fs.state;
 }
 
-static INLINE void r300_mark_atom_dirty(struct r300_context *r300,
+static inline void r300_mark_atom_dirty(struct r300_context *r300,
                                         struct r300_atom *atom)
 {
     atom->dirty = TRUE;
@@ -688,7 +690,7 @@ static INLINE void r300_mark_atom_dirty(struct r300_context *r300,
     }
 }
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 r300_get_nonnull_cb(struct pipe_framebuffer_state *fb, unsigned i)
 {
     if (fb->cbufs[i])
@@ -777,12 +779,12 @@ void r300_update_derived_state(struct r300_context* r300);
 void r500_dump_rs_block(struct r300_rs_block *rs);
 
 
-static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
+static inline boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
 {
     return SCREEN_DBG_ON(ctx->screen, flags);
 }
 
-static INLINE void CTX_DBG(struct r300_context * ctx, unsigned flags,
+static inline void CTX_DBG(struct r300_context * ctx, unsigned flags,
                        const char * fmt, ...)
 {
     if (CTX_DBG_ON(ctx, flags)) {
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 37f9641ab3e..fc150542d4b 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -46,7 +46,7 @@
 #ifdef DEBUG
 
 #define BEGIN_CS(size) do { \
-    assert(size <= (RADEON_MAX_CMDBUF_DWORDS - cs_copy->cdw)); \
+    assert(size <= (cs_copy->max_dw - cs_copy->cdw)); \
     cs_count = size; \
 } while (0)
 
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index 39eb73da65d..b39624dad5f 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -77,14 +77,14 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
 /* Return TRUE if the shader was switched and should be re-emitted. */
 boolean r300_pick_fragment_shader(struct r300_context* r300);
 
-static INLINE boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
+static inline boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
 {
     if (!fs)
         return FALSE;
     return (fs->shader->code.writes_depth) ? TRUE : FALSE;
 }
 
-static INLINE boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs)
+static inline boolean r300_fragment_shader_writes_all(struct r300_fragment_shader *fs)
 {
     if (!fs)
         return FALSE;
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 01b83b87fcf..4dd8156f616 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -146,10 +146,11 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
 
     if (q->type == PIPE_QUERY_GPU_FINISHED) {
         if (wait) {
-            r300->rws->buffer_wait(q->buf, RADEON_USAGE_READWRITE);
+            r300->rws->buffer_wait(q->buf, PIPE_TIMEOUT_INFINITE,
+                                   RADEON_USAGE_READWRITE);
             vresult->b = TRUE;
         } else {
-            vresult->b = !r300->rws->buffer_is_busy(q->buf, RADEON_USAGE_READWRITE);
+            vresult->b = r300->rws->buffer_wait(q->buf, 0, RADEON_USAGE_READWRITE);
         }
         return vresult->b;
     }
@@ -168,8 +169,6 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
         map++;
     }
 
-    r300->rws->buffer_unmap(q->cs_buf);
-
     if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
         vresult->b = temp != 0;
     } else {
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 4c951d14f10..0487b11e775 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -215,7 +215,7 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300,
     cs_dwords += r300_get_num_cs_end_dwords(r300);
 
     /* Reserve requested CS space. */
-    if (cs_dwords > (RADEON_MAX_CMDBUF_DWORDS - r300->cs->cdw)) {
+    if (cs_dwords > (r300->cs->max_dw - r300->cs->cdw)) {
         r300_flush(&r300->context, RADEON_FLUSH_ASYNC, NULL);
         flushed = TRUE;
     }
@@ -871,7 +871,7 @@ struct r300_render {
     uint8_t *vbo_ptr;
 };
 
-static INLINE struct r300_render*
+static inline struct r300_render*
 r300_render(struct vbuf_render* render)
 {
     return (struct r300_render*)render;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index a7bca915f57..4ca0b268bde 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -191,6 +191,10 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+        case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+        case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+        case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+        case PIPE_CAP_DEPTH_BOUNDS_TEST:
             return 0;
 
         /* SWTCL-only features. */
@@ -427,7 +431,7 @@ static int r300_get_video_param(struct pipe_screen *screen,
  * Whether the format matches:
  *   PIPE_FORMAT_?10?10?10?2_UNORM
  */
-static INLINE boolean
+static inline boolean
 util_format_is_rgba1010102_variant(const struct util_format_description *desc)
 {
    static const unsigned size[4] = {10, 10, 10, 2};
@@ -660,14 +664,6 @@ static void r300_fence_reference(struct pipe_screen *screen,
     rws->fence_reference(ptr, fence);
 }
 
-static boolean r300_fence_signalled(struct pipe_screen *screen,
-                                    struct pipe_fence_handle *fence)
-{
-    struct radeon_winsys *rws = r300_screen(screen)->rws;
-
-    return rws->fence_wait(rws, fence, 0);
-}
-
 static boolean r300_fence_finish(struct pipe_screen *screen,
                                  struct pipe_fence_handle *fence,
                                  uint64_t timeout)
@@ -712,7 +708,6 @@ struct pipe_screen* r300_screen_create(struct radeon_winsys *rws)
     r300screen->screen.is_video_format_supported = vl_video_buffer_is_format_supported;
     r300screen->screen.context_create = r300_create_context;
     r300screen->screen.fence_reference = r300_fence_reference;
-    r300screen->screen.fence_signalled = r300_fence_signalled;
     r300screen->screen.fence_finish = r300_fence_finish;
 
     r300_init_screen_resource_functions(r300screen);
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index 7bba39bf12b..e15c3c7de0c 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -51,11 +51,11 @@ struct r300_screen {
 
 
 /* Convenience cast wrappers. */
-static INLINE struct r300_screen* r300_screen(struct pipe_screen* screen) {
+static inline struct r300_screen* r300_screen(struct pipe_screen* screen) {
     return (struct r300_screen*)screen;
 }
 
-static INLINE struct radeon_winsys *
+static inline struct radeon_winsys *
 radeon_winsys(struct pipe_screen *screen) {
     return r300_screen(screen)->rws;
 }
@@ -102,12 +102,12 @@ radeon_winsys(struct pipe_screen *screen) {
 #define DBG_P_STAT      (1 << 25)
 /*@}*/
 
-static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
+static inline boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
 {
     return (screen->debug & flags) ? TRUE : FALSE;
 }
 
-static INLINE void SCREEN_DBG(struct r300_screen * screen, unsigned flags,
+static inline void SCREEN_DBG(struct r300_screen * screen, unsigned flags,
                               const char * fmt, ...)
 {
     if (SCREEN_DBG_ON(screen, flags)) {
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index de557b57776..6451a2c8df2 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -96,7 +96,7 @@ r300_buffer_transfer_map( struct pipe_context *context,
 
         /* Check if mapping this buffer would cause waiting for the GPU. */
         if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->cs_buf, RADEON_USAGE_READWRITE) ||
-            r300->rws->buffer_is_busy(rbuf->buf, RADEON_USAGE_READWRITE)) {
+            !r300->rws->buffer_wait(rbuf->buf, 0, RADEON_USAGE_READWRITE)) {
             unsigned i;
             struct pb_buffer *new_buf;
 
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.h b/src/gallium/drivers/r300/r300_screen_buffer.h
index b4c8520039b..14b849c8c93 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.h
+++ b/src/gallium/drivers/r300/r300_screen_buffer.h
@@ -46,7 +46,7 @@ struct pipe_resource *r300_buffer_create(struct pipe_screen *screen,
 
 /* Inline functions. */
 
-static INLINE struct r300_buffer *r300_buffer(struct pipe_resource *buffer)
+static inline struct r300_buffer *r300_buffer(struct pipe_resource *buffer)
 {
     return (struct r300_buffer *)buffer;
 }
diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h
index b756048c6c7..93bbc9d4a96 100644
--- a/src/gallium/drivers/r300/r300_shader_semantics.h
+++ b/src/gallium/drivers/r300/r300_shader_semantics.h
@@ -46,7 +46,7 @@ struct r300_shader_semantics {
     int num_generic;
 };
 
-static INLINE void r300_shader_semantics_reset(
+static inline void r300_shader_semantics_reset(
     struct r300_shader_semantics* info)
 {
     int i;
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index e886df87a60..d99d5ae0152 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -844,7 +844,7 @@ static void r300_tex_set_tiling_flags(struct r300_context *r300,
         tex->tex.macrotile[level]) {
         r300->rws->buffer_set_tiling(tex->buf, r300->cs,
                 tex->tex.microtile, tex->tex.macrotile[level],
-                0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0,
                 tex->tex.stride_in_bytes[0], false);
 
         tex->surface_level = level;
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index feec494c4dc..fbd91cda9fe 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -32,13 +32,13 @@
 
 /* Some maths. These should probably find their way to u_math, if needed. */
 
-static INLINE int pack_float_16_6x(float f) {
+static inline int pack_float_16_6x(float f) {
     return ((int)(f * 6.0) & 0xffff);
 }
 
 /* Blend state. */
 
-static INLINE uint32_t r300_translate_blend_function(int blend_func,
+static inline uint32_t r300_translate_blend_function(int blend_func,
                                                      boolean clamp)
 {
     switch (blend_func) {
@@ -60,7 +60,7 @@ static INLINE uint32_t r300_translate_blend_function(int blend_func,
     return 0;
 }
 
-static INLINE uint32_t r300_translate_blend_factor(int blend_fact)
+static inline uint32_t r300_translate_blend_factor(int blend_fact)
 {
     switch (blend_fact) {
         case PIPE_BLENDFACTOR_ONE:
@@ -113,7 +113,7 @@ static INLINE uint32_t r300_translate_blend_factor(int blend_fact)
 
 /* DSA state. */
 
-static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func)
+static inline uint32_t r300_translate_depth_stencil_function(int zs_func)
 {
     switch (zs_func) {
         case PIPE_FUNC_NEVER:
@@ -141,7 +141,7 @@ static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func)
     return 0;
 }
 
-static INLINE uint32_t r300_translate_stencil_op(int s_op)
+static inline uint32_t r300_translate_stencil_op(int s_op)
 {
     switch (s_op) {
         case PIPE_STENCIL_OP_KEEP:
@@ -168,7 +168,7 @@ static INLINE uint32_t r300_translate_stencil_op(int s_op)
     return 0;
 }
 
-static INLINE uint32_t r300_translate_alpha_function(int alpha_func)
+static inline uint32_t r300_translate_alpha_function(int alpha_func)
 {
     switch (alpha_func) {
         case PIPE_FUNC_NEVER:
@@ -195,7 +195,7 @@ static INLINE uint32_t r300_translate_alpha_function(int alpha_func)
     return 0;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 r300_translate_polygon_mode_front(unsigned mode) {
     switch (mode)
     {
@@ -213,7 +213,7 @@ r300_translate_polygon_mode_front(unsigned mode) {
     }
 }
 
-static INLINE uint32_t
+static inline uint32_t
 r300_translate_polygon_mode_back(unsigned mode) {
     switch (mode)
     {
@@ -233,7 +233,7 @@ r300_translate_polygon_mode_back(unsigned mode) {
 
 /* Texture sampler state. */
 
-static INLINE uint32_t r300_translate_wrap(int wrap)
+static inline uint32_t r300_translate_wrap(int wrap)
 {
     switch (wrap) {
         case PIPE_TEX_WRAP_REPEAT:
@@ -259,7 +259,7 @@ static INLINE uint32_t r300_translate_wrap(int wrap)
     }
 }
 
-static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
+static inline uint32_t r300_translate_tex_filters(int min, int mag, int mip,
                                                   boolean is_anisotropic)
 {
     uint32_t retval = 0;
@@ -308,7 +308,7 @@ static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
     return retval;
 }
 
-static INLINE uint32_t r300_anisotropy(unsigned max_aniso)
+static inline uint32_t r300_anisotropy(unsigned max_aniso)
 {
     if (max_aniso >= 16) {
         return R300_TX_MAX_ANISO_16_TO_1;
@@ -323,7 +323,7 @@ static INLINE uint32_t r300_anisotropy(unsigned max_aniso)
     }
 }
 
-static INLINE uint32_t r500_anisotropy(unsigned max_aniso)
+static inline uint32_t r500_anisotropy(unsigned max_aniso)
 {
     if (!max_aniso) {
         return 0;
@@ -336,7 +336,7 @@ static INLINE uint32_t r500_anisotropy(unsigned max_aniso)
 }
 
 /* Translate pipe_formats into PSC vertex types. */
-static INLINE uint16_t
+static inline uint16_t
 r300_translate_vertex_data_type(enum pipe_format format) {
     uint32_t result = 0;
     const struct util_format_description *desc;
@@ -410,7 +410,7 @@ r300_translate_vertex_data_type(enum pipe_format format) {
     return result;
 }
 
-static INLINE uint16_t
+static inline uint16_t
 r300_translate_vertex_data_swizzle(enum pipe_format format) {
     const struct util_format_description *desc;
     unsigned i, swizzle = 0;
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 6c01c0d21e4..5e4d50df27d 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -1063,7 +1063,7 @@ r300_texture_create_object(struct r300_screen *rscreen,
 
     rws->buffer_set_tiling(tex->buf, NULL,
             tex->tex.microtile, tex->tex.macrotile[0],
-            0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0,
             tex->tex.stride_in_bytes[0], false);
 
     return tex;
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
index b87164ba836..44303792f51 100644
--- a/src/gallium/drivers/r300/r300_transfer.c
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -41,7 +41,7 @@ struct r300_transfer {
 };
 
 /* Convenience cast wrapper. */
-static INLINE struct r300_transfer*
+static inline struct r300_transfer*
 r300_transfer(struct pipe_transfer* transfer)
 {
     return (struct r300_transfer*)transfer;
@@ -120,7 +120,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
         referenced_hw = TRUE;
     } else {
         referenced_hw =
-            r300->rws->buffer_is_busy(tex->buf, RADEON_USAGE_READWRITE);
+            !r300->rws->buffer_wait(tex->buf, 0, RADEON_USAGE_READWRITE);
     }
 
     trans = CALLOC_STRUCT(r300_transfer);
@@ -251,16 +251,12 @@ void r300_texture_transfer_unmap(struct pipe_context *ctx,
     struct r300_resource *tex = r300_resource(transfer->resource);
 
     if (trans->linear_texture) {
-        rws->buffer_unmap(trans->linear_texture->cs_buf);
-
         if (transfer->usage & PIPE_TRANSFER_WRITE) {
             r300_copy_into_tiled_texture(ctx, trans);
         }
 
         pipe_resource_reference(
             (struct pipe_resource**)&trans->linear_texture, NULL);
-    } else {
-        rws->buffer_unmap(tex->cs_buf);
     }
     FREE(transfer);
 }
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index dc0d90d759b..8317da727a2 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index 295cb4d80b7..42e8b0b1761 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -160,6 +160,9 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c
 	alu.op = ALU_OP1_MOVA_INT;
 	alu.src[0].sel = bc->index_reg[id];
 	alu.src[0].chan = 0;
+	if (bc->chip_class == CAYMAN)
+		alu.dst.sel = id == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
+
 	alu.last = 1;
 	r = r600_bytecode_add_alu(bc, &alu);
 	if (r)
@@ -167,12 +170,14 @@ int egcm_load_index_reg(struct r600_bytecode *bc, unsigned id, bool inside_alu_c
 
 	bc->ar_loaded = 0; /* clobbered */
 
-	memset(&alu, 0, sizeof(alu));
-	alu.op = id == 0 ? ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1;
-	alu.last = 1;
-	r = r600_bytecode_add_alu(bc, &alu);
-	if (r)
-		return r;
+	if (bc->chip_class == EVERGREEN) {
+		memset(&alu, 0, sizeof(alu));
+		alu.op = id == 0 ? ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1;
+		alu.last = 1;
+		r = r600_bytecode_add_alu(bc, &alu);
+		if (r)
+			return r;
+	}
 
 	/* Must split ALU group as index only applies to following group */
 	if (inside_alu_clause) {
diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h
index b534872f062..97e230f56c7 100644
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -521,4 +521,11 @@
 
 #define V_SQ_REL_ABSOLUTE 0
 #define V_SQ_REL_RELATIVE 1
+
+/* CAYMAN has special encoding for MOVA_INT destination */
+#define CM_V_SQ_MOVA_DST_AR_X 0
+#define CM_V_SQ_MOVA_DST_CF_PC 1
+#define CM_V_SQ_MOVA_DST_CF_IDX0 2
+#define CM_V_SQ_MOVA_DST_CF_IDX1 3
+
 #endif
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 4c3c34cd664..c52e43e9c2a 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -163,7 +163,7 @@ static void evergreen_cs_set_vertex_buffer(
 	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 	state->enabled_mask |= 1 << vb_index;
 	state->dirty_mask |= 1 << vb_index;
-	state->atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &state->atom);
 }
 
 static void evergreen_cs_set_constant_buffer(
@@ -226,7 +226,7 @@ void *evergreen_create_compute_state(
 	}
 #else
 	memset(&shader->binary, 0, sizeof(shader->binary));
-	radeon_elf_read(code, header->num_bytes, &shader->binary, true);
+	radeon_elf_read(code, header->num_bytes, &shader->binary);
 	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
 
 	shader->code_bo = r600_compute_buffer_alloc_vram(ctx->screen,
@@ -487,6 +487,12 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 	/* Emit constant buffer state */
 	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
 
+	/* Emit sampler state */
+	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
+
+	/* Emit sampler view (texture resource) state */
+	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
+
 	/* Emit compute shader state */
 	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);
 
@@ -655,25 +661,6 @@ static void evergreen_set_compute_resources(struct pipe_context * ctx_,
 	}
 }
 
-void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
-		unsigned start_slot, unsigned count,
-		struct pipe_sampler_view **views)
-{
-	struct r600_pipe_sampler_view **resource =
-		(struct r600_pipe_sampler_view **)views;
-
-	for (unsigned i = 0; i < count; i++)	{
-		if (resource[i]) {
-			assert(i+1 < 12);
-			/* XXX: Implement */
-			assert(!"Compute samplers not implemented.");
-			///FETCH0 = VTX0 (param buffer),
-			//FETCH1 = VTX1 (global buffer pool), FETCH2... = TEX
-		}
-	}
-}
-
-
 static void evergreen_set_global_binding(
 	struct pipe_context *ctx_, unsigned first, unsigned n,
 	struct pipe_resource **resources,
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 4ddbc0beba5..6a91d4709f4 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -32,7 +32,7 @@
 #include "evergreen_compute.h"
 #include "util/u_math.h"
 
-static INLINE unsigned evergreen_array_mode(unsigned mode)
+static inline unsigned evergreen_array_mode(unsigned mode)
 {
 	switch (mode) {
 	case RADEON_SURF_MODE_LINEAR_ALIGNED:	return V_028C70_ARRAY_LINEAR_ALIGNED;
@@ -485,7 +485,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 
 	/* offset */
 	rs->offset_units = state->offset_units;
-	rs->offset_scale = state->offset_scale * 12.0f;
+	rs->offset_scale = state->offset_scale * 16.0f;
 	rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri;
 
 	if (state->point_size_per_vertex) {
@@ -896,7 +896,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx,
 
 	for (i = start_slot; i < start_slot + num_scissors; i++) {
 		rctx->scissor[i].scissor = state[i - start_slot];
-		rctx->scissor[i].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom);
 	}
 }
 
@@ -1028,7 +1028,10 @@ void evergreen_init_color_surface(struct r600_context *rctx,
 	macro_aspect = rtex->surface.mtilea;
 	bankw = rtex->surface.bankw;
 	bankh = rtex->surface.bankh;
-	fmask_bankh = rtex->fmask.bank_height;
+	if (rtex->fmask.size)
+		fmask_bankh = rtex->fmask.bank_height;
+	else
+		fmask_bankh = rtex->surface.bankh;
 	tile_split = eg_tile_split(tile_split);
 	macro_aspect = eg_macro_tile_aspect(macro_aspect);
 	bankw = eg_bank_wh(bankw);
@@ -1149,10 +1152,11 @@ void evergreen_init_color_surface(struct r600_context *rctx,
 	surf->cb_color_attrib = color_attrib;
 	if (rtex->fmask.size) {
 		surf->cb_color_fmask = (base_offset + rtex->fmask.offset) >> 8;
+		surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max);
 	} else {
 		surf->cb_color_fmask = surf->cb_color_base;
+		surf->cb_color_fmask_slice = S_028C88_TILE_MAX(slice);
 	}
-	surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max);
 
 	surf->color_initialized = true;
 }
@@ -1342,11 +1346,11 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (rctx->alphatest_state.bypass != alphatest_bypass) {
 			rctx->alphatest_state.bypass = alphatest_bypass;
-			rctx->alphatest_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 		}
 		if (rctx->alphatest_state.cb0_export_16bpc != export_16bpc) {
 			rctx->alphatest_state.cb0_export_16bpc = export_16bpc;
-			rctx->alphatest_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 		}
 	}
 
@@ -1362,28 +1366,28 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (state->zsbuf->format != rctx->poly_offset_state.zs_format) {
 			rctx->poly_offset_state.zs_format = state->zsbuf->format;
-			rctx->poly_offset_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom);
 		}
 
 		if (rctx->db_state.rsurf != surf) {
 			rctx->db_state.rsurf = surf;
-			rctx->db_state.atom.dirty = true;
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	} else if (rctx->db_state.rsurf) {
 		rctx->db_state.rsurf = NULL;
-		rctx->db_state.atom.dirty = true;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 
 	if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) {
 		rctx->cb_misc_state.nr_cbufs = state->nr_cbufs;
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 
 	if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) {
 		rctx->alphatest_state.bypass = false;
-		rctx->alphatest_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 	}
 
 	log_samples = util_logbase2(rctx->framebuffer.nr_samples);
@@ -1392,7 +1396,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 	     rctx->b.family == CHIP_RV770) &&
 	    rctx->db_misc_state.log_samples != log_samples) {
 		rctx->db_misc_state.log_samples = log_samples;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 
 
@@ -1420,7 +1424,7 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 		rctx->framebuffer.atom.num_dw += 4;
 	}
 
-	rctx->framebuffer.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 
 	r600_set_sample_locations_constant_buffer(rctx);
 }
@@ -1434,7 +1438,7 @@ static void evergreen_set_min_samples(struct pipe_context *ctx, unsigned min_sam
 
 	rctx->ps_iter_samples = min_samples;
 	if (rctx->framebuffer.nr_samples > 1) {
-		rctx->framebuffer.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 	}
 }
 
@@ -1732,10 +1736,10 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_
 
 	r600_write_context_reg_seq(cs, R_028238_CB_TARGET_MASK, 2);
 	radeon_emit(cs, a->blend_colormask & fb_colormask); /* R_028238_CB_TARGET_MASK */
-	/* Always enable the first colorbuffer in CB_SHADER_MASK. This
-	 * will assure that the alpha-test will work even if there is
-	 * no colorbuffer bound. */
-	radeon_emit(cs, 0xf | (a->dual_src_blend ? ps_colormask : 0) | fb_colormask); /* R_02823C_CB_SHADER_MASK */
+	/* This must match the used export instructions exactly.
+	 * Other values may lead to undefined behavior and hangs.
+	 */
+	radeon_emit(cs, ps_colormask); /* R_02823C_CB_SHADER_MASK */
 }
 
 static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
@@ -1980,7 +1984,7 @@ static void evergreen_emit_cs_constant_buffers(struct r600_context *rctx, struct
 
 static void evergreen_emit_sampler_views(struct r600_context *rctx,
 					 struct r600_samplerview_state *state,
-					 unsigned resource_id_base)
+					 unsigned resource_id_base, unsigned pkt_flags)
 {
 	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
@@ -1993,7 +1997,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 		rview = state->views[resource_index];
 		assert(rview);
 
-		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0));
+		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags);
 		radeon_emit(cs, (resource_id_base + resource_index) * 8);
 		radeon_emit_array(cs, rview->tex_resource_words, 8);
 
@@ -2002,11 +2006,11 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 					      rview->tex_resource->b.b.nr_samples > 1 ?
 						      RADEON_PRIO_SHADER_TEXTURE_MSAA :
 						      RADEON_PRIO_SHADER_TEXTURE_RO);
-		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
 		radeon_emit(cs, reloc);
 
 		if (!rview->skip_mip_address_reloc) {
-			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
 			radeon_emit(cs, reloc);
 		}
 	}
@@ -2015,23 +2019,33 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 
 static void evergreen_emit_vs_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views, 176 + R600_MAX_CONST_BUFFERS);
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views,
+	                             176 + R600_MAX_CONST_BUFFERS, 0);
 }
 
 static void evergreen_emit_gs_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views, 336 + R600_MAX_CONST_BUFFERS);
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views,
+	                             336 + R600_MAX_CONST_BUFFERS, 0);
 }
 
 static void evergreen_emit_ps_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views, R600_MAX_CONST_BUFFERS);
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views,
+	                             R600_MAX_CONST_BUFFERS, 0);
+}
+
+static void evergreen_emit_cs_sampler_views(struct r600_context *rctx, struct r600_atom *atom)
+{
+	evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views,
+	                             816 + 2, RADEON_CP_PACKET3_COMPUTE_MODE);
 }
 
 static void evergreen_emit_sampler_states(struct r600_context *rctx,
 				struct r600_textures_info *texinfo,
 				unsigned resource_id_base,
-				unsigned border_index_reg)
+				unsigned border_index_reg,
+				unsigned pkt_flags)
 {
 	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
@@ -2043,7 +2057,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx,
 		rstate = texinfo->states.states[i];
 		assert(rstate);
 
-		radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0));
+		radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0) | pkt_flags);
 		radeon_emit(cs, (resource_id_base + i) * 3);
 		radeon_emit_array(cs, rstate->tex_sampler_words, 3);
 
@@ -2058,17 +2072,27 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx,
 
 static void evergreen_emit_vs_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18, R_00A414_TD_VS_SAMPLER0_BORDER_INDEX);
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_VERTEX], 18,
+	                              R_00A414_TD_VS_SAMPLER0_BORDER_INDEX, 0);
 }
 
 static void evergreen_emit_gs_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36, R_00A428_TD_GS_SAMPLER0_BORDER_INDEX);
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY], 36,
+	                              R_00A428_TD_GS_SAMPLER0_BORDER_INDEX, 0);
 }
 
 static void evergreen_emit_ps_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
 {
-	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0, R_00A400_TD_PS_SAMPLER0_BORDER_INDEX);
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT], 0,
+	                              R_00A400_TD_PS_SAMPLER0_BORDER_INDEX, 0);
+}
+
+static void evergreen_emit_cs_sampler_states(struct r600_context *rctx, struct r600_atom *atom)
+{
+	evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE], 90,
+	                              R_00A464_TD_CS_SAMPLER0_BORDER_INDEX,
+	                              RADEON_CP_PACKET3_COMPUTE_MODE);
 }
 
 static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a)
@@ -3176,7 +3200,7 @@ void evergreen_update_db_shader_control(struct r600_context * rctx)
 
 	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
@@ -3431,12 +3455,14 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].states.atom, id++, evergreen_emit_vs_sampler_states, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].states.atom, id++, evergreen_emit_gs_sampler_states, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].states.atom, id++, evergreen_emit_ps_sampler_states, 0);
+	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom, id++, evergreen_emit_cs_sampler_states, 0);
 	/* resources */
 	r600_init_atom(rctx, &rctx->vertex_buffer_state.atom, id++, evergreen_fs_emit_vertex_buffers, 0);
 	r600_init_atom(rctx, &rctx->cs_vertex_buffer_state.atom, id++, evergreen_cs_emit_vertex_buffers, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_VERTEX].views.atom, id++, evergreen_emit_vs_sampler_views, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_GEOMETRY].views.atom, id++, evergreen_emit_gs_sampler_views, 0);
 	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_FRAGMENT].views.atom, id++, evergreen_emit_ps_sampler_views, 0);
+	r600_init_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom, id++, evergreen_emit_cs_sampler_views, 0);
 
 	r600_init_atom(rctx, &rctx->vgt_state.atom, id++, r600_emit_vgt_state, 10);
 
@@ -3466,8 +3492,8 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	}
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5);
-	rctx->atoms[id++] = &rctx->b.streamout.begin_atom;
-	rctx->atoms[id++] = &rctx->b.streamout.enable_atom;
+	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
+	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
 	r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0);
 	r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0);
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index cd4ff46b103..ad6ad434b78 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -1253,6 +1253,11 @@
 #define R_00A430_TD_GS_SAMPLER0_BORDER_GREEN         0x00A430
 #define R_00A434_TD_GS_SAMPLER0_BORDER_BLUE          0x00A434
 #define R_00A438_TD_GS_SAMPLER0_BORDER_ALPHA         0x00A438
+#define R_00A464_TD_CS_SAMPLER0_BORDER_INDEX         0x00A464
+#define R_00A468_TD_CS_SAMPLER0_BORDER_RED           0x00A468
+#define R_00A46C_TD_CS_SAMPLER0_BORDER_GREEN         0x00A46C
+#define R_00A470_TD_CS_SAMPLER0_BORDER_BLUE          0x00A470
+#define R_00A474_TD_CS_SAMPLER0_BORDER_ALPHA         0x00A474
 
 #define R_03C000_SQ_TEX_SAMPLER_WORD0_0              0x03C000
 #define   S_03C000_CLAMP_X(x)                          (((x) & 0x7) << 0)
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 01262a59e90..b0002c3b50f 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -145,7 +145,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx,
 	rctx->db_misc_state.copy_depth = util_format_has_depth(desc);
 	rctx->db_misc_state.copy_stencil = util_format_has_stencil(desc);
 	rctx->db_misc_state.copy_sample = first_sample;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 
 	for (level = first_level; level <= last_level; level++) {
 		if (!staging && !(texture->dirty_level_mask & (1 << level)))
@@ -162,7 +162,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx,
 
 				if (sample != rctx->db_misc_state.copy_sample) {
 					rctx->db_misc_state.copy_sample = sample;
-					rctx->db_misc_state.atom.dirty = true;
+					r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 				}
 
 				surf_tmpl.format = texture->resource.b.b.format;
@@ -197,7 +197,7 @@ static void r600_blit_decompress_depth(struct pipe_context *ctx,
 
 	/* reenable compression in DB_RENDER_CONTROL */
 	rctx->db_misc_state.flush_depthstencil_through_cb = false;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 }
 
 static void r600_blit_decompress_depth_in_place(struct r600_context *rctx,
@@ -210,7 +210,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx,
 
 	/* Enable decompression in DB_RENDER_CONTROL */
 	rctx->db_misc_state.flush_depthstencil_in_place = true;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 
 	surf_tmpl.format = texture->resource.b.b.format;
 
@@ -248,7 +248,7 @@ static void r600_blit_decompress_depth_in_place(struct r600_context *rctx,
 
 	/* Disable decompression in DB_RENDER_CONTROL */
 	rctx->db_misc_state.flush_depthstencil_in_place = false;
-	rctx->db_misc_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 }
 
 void r600_decompress_depth_textures(struct r600_context *rctx,
@@ -396,6 +396,8 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR && rctx->b.chip_class >= EVERGREEN) {
 		evergreen_do_fast_color_clear(&rctx->b, fb, &rctx->framebuffer.atom,
 					      &buffers, color);
+		if (!buffers)
+			return; /* all buffers have been fast cleared */
 	}
 
 	if (buffers & PIPE_CLEAR_COLOR) {
@@ -435,10 +437,10 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
                    fb->zsbuf->u.tex.last_layer == util_max_layer(&rtex->resource.b.b, level)) {
 			if (rtex->depth_clear_value != depth) {
 				rtex->depth_clear_value = depth;
-				rctx->db_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
 			}
 			rctx->db_misc_state.htile_clear = true;
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	}
 
@@ -451,7 +453,7 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 	/* disable fast clear */
 	if (rctx->db_misc_state.htile_clear) {
 		rctx->db_misc_state.htile_clear = false;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
diff --git a/src/gallium/drivers/r600/r600_formats.h b/src/gallium/drivers/r600/r600_formats.h
index fa374d92e6f..9533aaa1378 100644
--- a/src/gallium/drivers/r600/r600_formats.h
+++ b/src/gallium/drivers/r600/r600_formats.h
@@ -64,7 +64,7 @@
 #define     ENDIAN_8IN32                    2
 #define     ENDIAN_8IN64                    3
 
-static INLINE unsigned r600_endian_swap(unsigned size)
+static inline unsigned r600_endian_swap(unsigned size)
 {
 	if (R600_BIG_ENDIAN) {
 		switch (size) {
@@ -82,7 +82,7 @@ static INLINE unsigned r600_endian_swap(unsigned size)
 	}
 }
 
-static INLINE bool r600_is_vertex_format_supported(enum pipe_format format)
+static inline bool r600_is_vertex_format_supported(enum pipe_format format)
 {
 	const struct util_format_description *desc = util_format_description(format);
 	unsigned i;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 8eb0c6806b9..64451516c23 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -51,13 +51,13 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 		unsigned i;
 
 		/* The number of dwords all the dirty states would take. */
-		for (i = 0; i < R600_NUM_ATOMS; i++) {
-			if (ctx->atoms[i] && ctx->atoms[i]->dirty) {
-				num_dw += ctx->atoms[i]->num_dw;
-				if (ctx->screen->b.trace_bo) {
-					num_dw += R600_TRACE_CS_DWORDS;
-				}
+		i = r600_next_dirty_atom(ctx, 0);
+		while (i < R600_NUM_ATOMS) {
+			num_dw += ctx->atoms[i]->num_dw;
+			if (ctx->screen->b.trace_bo) {
+				num_dw += R600_TRACE_CS_DWORDS;
 			}
+			i = r600_next_dirty_atom(ctx, i + 1);
 		}
 
 		/* The upper-bound of how much space a draw command would take. */
@@ -68,7 +68,8 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	}
 
 	/* Count in queries_suspend. */
-	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend;
+	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend +
+		  ctx->b.num_cs_dw_timer_queries_suspend;
 
 	/* Count in streamout_end at the end of CS. */
 	if (ctx->b.streamout.begin_emitted) {
@@ -92,7 +93,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	num_dw += 10;
 
 	/* Flush if there's not enough space. */
-	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+	if (num_dw > ctx->b.rings.gfx.cs->max_dw) {
 		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 }
@@ -295,43 +296,45 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd);
 
 	/* Re-emit states. */
-	ctx->alphatest_state.atom.dirty = true;
-	ctx->blend_color.atom.dirty = true;
-	ctx->cb_misc_state.atom.dirty = true;
-	ctx->clip_misc_state.atom.dirty = true;
-	ctx->clip_state.atom.dirty = true;
-	ctx->db_misc_state.atom.dirty = true;
-	ctx->db_state.atom.dirty = true;
-	ctx->framebuffer.atom.dirty = true;
-	ctx->pixel_shader.atom.dirty = true;
-	ctx->poly_offset_state.atom.dirty = true;
-	ctx->vgt_state.atom.dirty = true;
-	ctx->sample_mask.atom.dirty = true;
+	r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->blend_color.atom);
+	r600_mark_atom_dirty(ctx, &ctx->cb_misc_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->clip_misc_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->clip_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->db_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
+	r600_mark_atom_dirty(ctx, &ctx->pixel_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom);
+	r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
 	for (i = 0; i < R600_MAX_VIEWPORTS; i++) {
-		ctx->scissor[i].atom.dirty = true;
-		ctx->viewport[i].atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->scissor[i].atom);
+		r600_mark_atom_dirty(ctx, &ctx->viewport[i].atom);
 	}
-	ctx->config_state.atom.dirty = true;
-	ctx->stencil_ref.atom.dirty = true;
-	ctx->vertex_fetch_shader.atom.dirty = true;
-	ctx->export_shader.atom.dirty = true;
-	ctx->shader_stages.atom.dirty = true;
+	if (ctx->b.chip_class < EVERGREEN) {
+		r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
+	}
+	r600_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
+	r600_mark_atom_dirty(ctx, &ctx->vertex_fetch_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->export_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->shader_stages.atom);
 	if (ctx->gs_shader) {
-		ctx->geometry_shader.atom.dirty = true;
-		ctx->gs_rings.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->geometry_shader.atom);
+		r600_mark_atom_dirty(ctx, &ctx->gs_rings.atom);
 	}
-	ctx->vertex_shader.atom.dirty = true;
-	ctx->b.streamout.enable_atom.dirty = true;
+	r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom);
+	r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
 
 	if (ctx->blend_state.cso)
-		ctx->blend_state.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->blend_state.atom);
 	if (ctx->dsa_state.cso)
-		ctx->dsa_state.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->dsa_state.atom);
 	if (ctx->rasterizer_state.cso)
-		ctx->rasterizer_state.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->rasterizer_state.atom);
 
 	if (ctx->b.chip_class <= R700) {
-		ctx->seamless_cube_map.atom.dirty = true;
+		r600_mark_atom_dirty(ctx, &ctx->seamless_cube_map.atom);
 	}
 
 	ctx->vertex_buffer_state.dirty_mask = ctx->vertex_buffer_state.enabled_mask;
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index 72e2dc42f7e..faf538ccbb5 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -84,7 +84,7 @@ static void llvm_load_system_value(
 #else
 	LLVMValueRef reg = lp_build_const_int32(
 			ctx->soa.bld_base.base.gallivm, chan);
-	ctx->system_values[index] = build_intrinsic(
+	ctx->system_values[index] = lp_build_intrinsic(
 			ctx->soa.bld_base.base.gallivm->builder,
 			"llvm.R600.load.input",
 			ctx->soa.bld_base.base.elem_type, &reg, 1,
@@ -111,9 +111,9 @@ llvm_load_input_vector(
 			Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex,
 				lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2) + 1), "");
 			LLVMValueRef HalfVec[2] = {
-				build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy",
+				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy",
 					VecType, Args, ArgCount, LLVMReadNoneAttribute),
-				build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw",
+				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw",
 					VecType, Args, ArgCount, LLVMReadNoneAttribute)
 			};
 			LLVMValueRef MaskInputs[4] = {
@@ -127,7 +127,7 @@ llvm_load_input_vector(
 				Mask, "");
 		} else {
 			VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 4);
-			return build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const",
+			return lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const",
 				VecType, Args, ArgCount, LLVMReadNoneAttribute);
 		}
 }
@@ -153,7 +153,7 @@ llvm_load_input_helper(
 		arg_count = 1;
 	}
 
-	return build_intrinsic(bb->gallivm->builder, intrinsic,
+	return lp_build_intrinsic(bb->gallivm->builder, intrinsic,
 		bb->elem_type, &arg[0], arg_count, LLVMReadNoneAttribute);
 }
 #endif
@@ -332,7 +332,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 			args[2] = lp_build_const_int32(base->gallivm, so->output[i].output_buffer);
 			args[3] = lp_build_const_int32(base->gallivm, ((1 << num_components) - 1) << start_component);
 			lp_build_intrinsic(base->gallivm->builder, "llvm.R600.store.stream.output",
-				LLVMVoidTypeInContext(base->gallivm->context), args, 4);
+				LLVMVoidTypeInContext(base->gallivm->context), args, 4, 0);
 		}
 	}
 
@@ -356,7 +356,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = output;
 				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -373,7 +373,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 						LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE);
 						args[0] = output;
 						args[1] = base_vector;
-						adjusted_elements[chan] = build_intrinsic(base->gallivm->builder,
+						adjusted_elements[chan] = lp_build_intrinsic(base->gallivm->builder,
 							"llvm.AMDGPU.dp4", bld_base->base.elem_type,
 							args, 2, LLVMReadNoneAttribute);
 					}
@@ -381,7 +381,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 						adjusted_elements, 4);
 					args[1] = lp_build_const_int32(base->gallivm, next_pos++);
 					args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-					build_intrinsic(
+					lp_build_intrinsic(
 						base->gallivm->builder,
 						"llvm.R600.store.swizzle",
 						LLVMVoidTypeInContext(base->gallivm->context),
@@ -394,14 +394,14 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = output;
 				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
 					args, 3, 0);
 				args[1] = lp_build_const_int32(base->gallivm, next_param++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -418,7 +418,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = lp_build_gather_values(base->gallivm, elements, 4);
 				args[1] = lp_build_const_int32(base->gallivm, next_param++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -430,7 +430,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 				args[0] = output;
 				args[1] = lp_build_const_int32(base->gallivm, next_param++);
 				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				build_intrinsic(
+				lp_build_intrinsic(
 					base->gallivm->builder,
 					"llvm.R600.store.swizzle",
 					LLVMVoidTypeInContext(base->gallivm->context),
@@ -449,7 +449,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 						for (unsigned j = 0; j < ctx->color_buffer_count; j++) {
 							args[1] = lp_build_const_int32(base->gallivm, j);
 							args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-							build_intrinsic(
+							lp_build_intrinsic(
 								base->gallivm->builder,
 								"llvm.R600.store.swizzle",
 								LLVMVoidTypeInContext(base->gallivm->context),
@@ -458,7 +458,7 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 					} else {
 						args[1] = lp_build_const_int32(base->gallivm, color_count++);
 						args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-						build_intrinsic(
+						lp_build_intrinsic(
 							base->gallivm->builder,
 							"llvm.R600.store.swizzle",
 							LLVMVoidTypeInContext(base->gallivm->context),
@@ -543,7 +543,7 @@ static void llvm_emit_tex(
 		case TGSI_OPCODE_TXF: {
 			args[0] = LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], lp_build_const_int32(gallivm, 0), "");
 			args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS);
-			emit_data->output[0] = build_intrinsic(gallivm->builder,
+			emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
 							"llvm.R600.load.texbuf",
 							emit_data->dst_type, args, 2, LLVMReadNoneAttribute);
 			if (ctx->chip_class >= EVERGREEN)
@@ -658,7 +658,7 @@ static void llvm_emit_tex(
 				lp_build_const_int32(gallivm, 1),
 				lp_build_const_int32(gallivm, 1)
 			};
-			LLVMValueRef ptr = build_intrinsic(gallivm->builder,
+			LLVMValueRef ptr = lp_build_intrinsic(gallivm->builder,
 				"llvm.R600.ldptr",
 				emit_data->dst_type, ldptr_args, 10, LLVMReadNoneAttribute);
 			LLVMValueRef Tmp = LLVMBuildExtractElement(gallivm->builder, args[0],
@@ -679,7 +679,7 @@ static void llvm_emit_tex(
 		}
 	}
 
-	emit_data->output[0] = build_intrinsic(gallivm->builder,
+	emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
 					action->intr_name,
 					emit_data->dst_type, args, c, LLVMReadNoneAttribute);
 
@@ -754,7 +754,131 @@ static struct lp_build_tgsi_action dot_action = {
 	.intr_name = "llvm.AMDGPU.dp4"
 };
 
+static void txd_fetch_args( /* fetch_args hook installed for TGSI_OPCODE_TXD */
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
 
+	LLVMValueRef coords[4]; /* scratch: the four channels of the operand being packed */
+	unsigned chan, src;
+	for (src = 0; src < 3; src++) { /* sources 0..2; presumably coords, ddx, ddy - confirm against the TGSI TXD definition */
+		for (chan = 0; chan < 4; chan++)
+			coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan);
+
+		emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm,
+				coords, 4); /* one 4-wide vector argument per source operand */
+	}
+	emit_data->arg_count = 3;
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); /* result type: 4-element vector of elem_type */
+}
+
+/* fetch_args hook for TXP: divides x/y/z of source 0 by its w and emits one 4-wide vector argument. */
+static void txp_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	LLVMValueRef src_w;
+	unsigned chan;
+	LLVMValueRef coords[5]; /* only entries 0..3 are assigned in this function */
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
+
+	for (chan = 0; chan < 3; chan++ ) {
+		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
+						emit_data->inst, 0, chan);
+		coords[chan] = lp_build_emit_llvm_binary(bld_base,
+					TGSI_OPCODE_DIV, arg, src_w); /* projective divide of this channel by w */
+	}
+	coords[3] = bld_base->base.one; /* fourth slot holds the builder's constant one after the divide */
+
+	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { /* NOTE(review): opcode tests look redundant if this hook only serves TXP - confirm */
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
+	}
+
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4); /* only the first four entries are gathered */
+	emit_data->arg_count = 1;
+}
+/* Generic texture fetch_args hook: gathers the four channels of source 0 into a single 4-wide vector argument. */
+static void tex_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+
+	LLVMValueRef coords[5]; /* 0..3: channels of source 0; 4: extra operand for the *2 opcodes */
+	unsigned chan;
+	for (chan = 0; chan < 4; chan++) {
+		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
+	}
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
+		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
+		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
+		/* These instructions have an additional operand that should be packed
+		 * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords.
+		 * That operand is fetched here as a float value into coords[4], the
+		 * slot right after the coord vector. After packing it is not used
+		 * anymore, which is why arg_count is not increased. */
+		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0); /* channel 0 of source 1 */
+	}
+
+	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { /* cube targets only, and never for the TXQ queries */
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL); /* NOTE(review): coords[4] is only initialized for TEX2/TXB2/TXL2 - confirm the helper never reads it otherwise */
+	}
+
+	emit_data->arg_count = 1;
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4); /* only the first four entries are gathered */
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+}
+/* fetch_args hook for TXF: coordinate vector from tex_fetch_args plus three integer texel offsets (zero when none are given). */
+static void txf_fetch_args(
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+	const struct tgsi_texture_offset * off = inst->TexOffsets; /* first (and only supported) offset entry */
+	LLVMTypeRef offset_type = bld_base->int_bld.elem_type;
+
+	/* fetch tex coords */
+	tex_fetch_args(bld_base, emit_data); /* fills args[0] and dst_type; arg_count is overwritten below */
+
+	/* fetch tex offsets */
+	if (inst->Texture.NumOffsets) {
+		assert(inst->Texture.NumOffsets == 1);
+
+		emit_data->args[1] = LLVMConstBitCast( /* NOTE(review): offsets are always read from bld->immediates; off->File is not checked - confirm */
+			bld->immediates[off->Index][off->SwizzleX],
+			offset_type);
+		emit_data->args[2] = LLVMConstBitCast(
+			bld->immediates[off->Index][off->SwizzleY],
+			offset_type);
+		emit_data->args[3] = LLVMConstBitCast(
+			bld->immediates[off->Index][off->SwizzleZ],
+			offset_type);
+	} else {
+		emit_data->args[1] = bld_base->int_bld.zero;
+		emit_data->args[2] = bld_base->int_bld.zero;
+		emit_data->args[3] = bld_base->int_bld.zero;
+	}
+
+	emit_data->arg_count = 4; /* args[0] = coords, args[1..3] = X/Y/Z offset */
+}
 
 LLVMModuleRef r600_tgsi_llvm(
 	struct radeon_llvm_context * ctx,
@@ -783,7 +907,6 @@ LLVMModuleRef r600_tgsi_llvm(
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const;
 	bld_base->emit_prologue = llvm_emit_prologue;
 	bld_base->emit_epilogue = llvm_emit_epilogue;
-	ctx->userdata = ctx;
 	ctx->load_input = llvm_load_input;
 	ctx->load_system_value = llvm_load_system_value;
 
@@ -791,18 +914,42 @@ LLVMModuleRef r600_tgsi_llvm(
 	bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action;
+	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
+	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
+	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TEX2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb";
 	bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb";
 	bld_base->op_actions[TGSI_OPCODE_TXB2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
 	bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
 	bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
+	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl";
+	bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
+	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cndlt;
 
 	lp_build_tgsi_llvm(bld_base, tokens);
@@ -881,7 +1028,7 @@ unsigned r600_llvm_compile(
 	const char * gpu_family = r600_get_llvm_processor_name(family);
 
 	memset(&binary, 0, sizeof(struct radeon_shader_binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, NULL);
+	r = radeon_llvm_compile(mod, &binary, gpu_family, dump, dump, NULL);
 
 	r = r600_create_shader(bc, &binary, use_kill);
 
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index e122b607b86..6ffe5615fbf 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -120,6 +120,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 	rctx->b.b.screen = screen;
 	rctx->b.b.priv = priv;
 	rctx->b.b.destroy = r600_destroy_context;
+	rctx->b.set_atom_dirty = (void *)r600_set_atom_dirty;
 
 	if (!r600_common_context_init(&rctx->b, &rscreen->b))
 		goto fail;
@@ -176,7 +177,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 		goto fail;
 	}
 
-	rctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX,
+	rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
 					     r600_context_gfx_flush, rctx,
 					     rscreen->b.trace_bo ?
 						     rscreen->b.trace_bo->cs_buf : NULL);
@@ -268,8 +269,14 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_SAMPLE_SHADING:
 	case PIPE_CAP_CLIP_HALFZ:
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 		return 1;
 
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+		return rscreen->b.info.drm_major == 2 && rscreen->b.info.drm_minor >= 43;
+
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 		return !R600_BIG_ENDIAN && rscreen->b.info.has_userptr;
 
@@ -329,10 +336,10 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
-	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 4ea270d3839..9b66105641a 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -36,7 +36,7 @@
 #include "util/list.h"
 #include "util/u_transfer.h"
 
-#define R600_NUM_ATOMS 73
+#define R600_NUM_ATOMS 75
 
 #define R600_MAX_VIEWPORTS 16
 
@@ -85,6 +85,9 @@
 #define R600_BIG_ENDIAN 0
 #endif
 
+#define R600_DIRTY_ATOM_WORD_BITS (sizeof(unsigned long) * 8)
+#define R600_DIRTY_ATOM_ARRAY_LEN DIV_ROUND_UP(R600_NUM_ATOMS, R600_DIRTY_ATOM_WORD_BITS)
+
 struct r600_context;
 struct r600_bytecode;
 struct r600_shader_key;
@@ -426,6 +429,8 @@ struct r600_context {
 
 	/* State binding slots are here. */
 	struct r600_atom		*atoms[R600_NUM_ATOMS];
+	/* Dirty atom bitmask for fast tests */
+	unsigned long			dirty_atoms[R600_DIRTY_ATOM_ARRAY_LEN];
 	/* States for CS initialization. */
 	struct r600_command_buffer	start_cs_cmd; /* invariant state mostly */
 	/** Compute specific registers initializations.  The start_cs_cmd atom
@@ -490,37 +495,92 @@ struct r600_context {
 	struct r600_isa		*isa;
 };
 
-static INLINE void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
+static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs, /* copies cb's dwords into cs at the current write position */
 					    struct r600_command_buffer *cb)
 {
-	assert(cs->cdw + cb->num_dw <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw + cb->num_dw <= cs->max_dw); /* capacity check against this CS's own max_dw */
 	memcpy(cs->buf + cs->cdw, cb->buf, 4 * cb->num_dw);
 	cs->cdw += cb->num_dw;
 }
 
+static inline void r600_set_atom_dirty(struct r600_context *rctx,
+				       struct r600_atom *atom,
+				       bool dirty)
+{
+	unsigned long mask;
+	unsigned int w;
+
+	atom->dirty = dirty;
+
+	assert(atom->id != 0);
+	w = atom->id / R600_DIRTY_ATOM_WORD_BITS;
+	mask = 1ul << (atom->id % R600_DIRTY_ATOM_WORD_BITS);
+	if (dirty)
+		rctx->dirty_atoms[w] |= mask;
+	else
+		rctx->dirty_atoms[w] &= ~mask;
+}
+/* Convenience wrapper: flag an atom as dirty via r600_set_atom_dirty(). */
+static inline void r600_mark_atom_dirty(struct r600_context *rctx,
+					struct r600_atom *atom)
+{
+	r600_set_atom_dirty(rctx, atom, true);
+}
+/* Find the lowest set bit at position >= id in rctx->dirty_atoms; the word-scan path returns R600_NUM_ATOMS when nothing is dirty. */
+static inline unsigned int r600_next_dirty_atom(struct r600_context *rctx,
+						unsigned int id)
+{
+#if !defined(DEBUG) && defined(HAVE___BUILTIN_CTZ)
+	unsigned int w = id / R600_DIRTY_ATOM_WORD_BITS; /* word that contains bit id */
+	unsigned int bit = id % R600_DIRTY_ATOM_WORD_BITS;
+	unsigned long bits, mask = (1ul << bit) - 1; /* bits below id, to be ignored in the first word */
+
+	for (; w < R600_DIRTY_ATOM_ARRAY_LEN; w++, mask = 0ul) { /* mask is reset so later words are scanned in full */
+		bits = rctx->dirty_atoms[w] & ~mask;
+		if (bits == 0)
+			continue;
+		return w * R600_DIRTY_ATOM_WORD_BITS + __builtin_ctzl(bits); /* lowest set bit of this word */
+	}
+
+	return R600_NUM_ATOMS;
+#else
+	for (; id < R600_NUM_ATOMS; id++) { /* fallback path: test one bit per iteration */
+		bool dirty = !!(rctx->dirty_atoms[id / R600_DIRTY_ATOM_WORD_BITS] &
+			(1ul << (id % R600_DIRTY_ATOM_WORD_BITS)));
+		assert(dirty == (rctx->atoms[id] && rctx->atoms[id]->dirty)); /* cross-checks the bitmask against the atom's own flag */
+		if (dirty)
+			break;
+	}
+
+	return id; /* equals R600_NUM_ATOMS when the loop ran to completion */
+#endif
+}
+
 void r600_trace_emit(struct r600_context *rctx);
 
-static INLINE void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
+static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
 {
 	atom->emit(&rctx->b, atom);
-	atom->dirty = false;
+	r600_set_atom_dirty(rctx, atom, false); /* clears atom->dirty and its bit in rctx->dirty_atoms */
 	if (rctx->screen->b.trace_bo) {
 		r600_trace_emit(rctx);
 	}
 }
 
-static INLINE void r600_set_cso_state(struct r600_cso_state *state, void *cso)
+static inline void r600_set_cso_state(struct r600_context *rctx,
+				      struct r600_cso_state *state, void *cso)
 {
 	state->cso = cso;
-	state->atom.dirty = cso != NULL;
+	r600_set_atom_dirty(rctx, &state->atom, cso != NULL); /* dirty only when a non-NULL cso is bound */
 }
 
-static INLINE void r600_set_cso_state_with_cb(struct r600_cso_state *state, void *cso,
+static inline void r600_set_cso_state_with_cb(struct r600_context *rctx,
+					      struct r600_cso_state *state, void *cso,
 					      struct r600_command_buffer *cb)
 {
 	state->cb = cb;
 	state->atom.num_dw = cb ? cb->num_dw : 0;
-	r600_set_cso_state(state, cso);
+	r600_set_cso_state(rctx, state, cso); /* stores cso and updates the atom's dirty state through rctx */
 }
 
 /* compute_memory_pool.c */
@@ -529,11 +589,6 @@ void compute_memory_pool_delete(struct compute_memory_pool* pool);
 struct compute_memory_pool* compute_memory_pool_new(
 	struct r600_screen *rscreen);
 
-/* evergreen_compute.c */
-void evergreen_set_cs_sampler_view(struct pipe_context *ctx_,
-                                   unsigned start_slot, unsigned count,
-                                   struct pipe_sampler_view **views);
-
 /* evergreen_state.c */
 struct pipe_sampler_view *
 evergreen_create_sampler_view_custom(struct pipe_context *ctx,
@@ -656,6 +711,7 @@ void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a);
+void r600_add_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id);
 void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id,
 		    void (*emit)(struct r600_context *ctx, struct r600_atom *state),
 		    unsigned num_dw);
@@ -719,19 +775,19 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe,
 /*Evergreen Compute packet3*/
 #define PKT3C(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate) | RADEON_CP_PACKET3_COMPUTE_MODE)
 
-static INLINE void r600_store_value(struct r600_command_buffer *cb, unsigned value)
+static inline void r600_store_value(struct r600_command_buffer *cb, unsigned value)
 {
 	cb->buf[cb->num_dw++] = value;
 }
 
-static INLINE void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr)
+static inline void r600_store_array(struct r600_command_buffer *cb, unsigned num, unsigned *ptr)
 {
 	assert(cb->num_dw+num <= cb->max_num_dw);
 	memcpy(&cb->buf[cb->num_dw], ptr, num * sizeof(ptr[0]));
 	cb->num_dw += num;
 }
 
-static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_config_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg < R600_CONTEXT_REG_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -743,7 +799,7 @@ static INLINE void r600_store_config_reg_seq(struct r600_command_buffer *cb, uns
  * Needs cb->pkt_flags set to  RADEON_CP_PACKET3_COMPUTE_MODE for compute
  * shaders.
  */
-static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_context_reg_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CONTEXT_REG_OFFSET && reg < R600_CTL_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -755,7 +811,7 @@ static INLINE void r600_store_context_reg_seq(struct r600_command_buffer *cb, un
  * Needs cb->pkt_flags set to  RADEON_CP_PACKET3_COMPUTE_MODE for compute
  * shaders.
  */
-static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CTL_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -763,7 +819,7 @@ static INLINE void r600_store_ctl_const_seq(struct r600_command_buffer *cb, unsi
 	cb->buf[cb->num_dw++] = (reg - R600_CTL_CONST_OFFSET) >> 2;
 }
 
-static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void r600_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_LOOP_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -775,7 +831,7 @@ static INLINE void r600_store_loop_const_seq(struct r600_command_buffer *cb, uns
  * Needs cb->pkt_flags set to  RADEON_CP_PACKET3_COMPUTE_MODE for compute
  * shaders.
  */
-static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
+static inline void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsigned reg, unsigned num)
 {
 	assert(reg >= EG_LOOP_CONST_OFFSET);
 	assert(cb->num_dw+2+num <= cb->max_num_dw);
@@ -783,31 +839,31 @@ static INLINE void eg_store_loop_const_seq(struct r600_command_buffer *cb, unsig
 	cb->buf[cb->num_dw++] = (reg - EG_LOOP_CONST_OFFSET) >> 2;
 }
 
-static INLINE void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_config_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_config_reg_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_context_reg(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_context_reg_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_ctl_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_ctl_const_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void r600_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	r600_store_loop_const_seq(cb, reg, 1);
 	r600_store_value(cb, value);
 }
 
-static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
+static inline void eg_store_loop_const(struct r600_command_buffer *cb, unsigned reg, unsigned value)
 {
 	eg_store_loop_const_seq(cb, reg, 1);
 	r600_store_value(cb, value);
@@ -816,28 +872,28 @@ static INLINE void eg_store_loop_const(struct r600_command_buffer *cb, unsigned
 void r600_init_command_buffer(struct r600_command_buffer *cb, unsigned num_dw);
 void r600_release_command_buffer(struct r600_command_buffer *cb);
 
-static INLINE void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_compute_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	r600_write_context_reg_seq(cs, reg, num);
 	/* Set the compute bit on the packet header */
 	cs->buf[cs->cdw - 2] |= RADEON_CP_PACKET3_COMPUTE_MODE;
 }
 
-static INLINE void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CTL_CONST_OFFSET);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);
 	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CTL_CONST, num, 0);
 	cs->buf[cs->cdw++] = (reg - R600_CTL_CONST_OFFSET) >> 2;
 }
 
-static INLINE void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_compute_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	r600_write_compute_context_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag)
+static inline void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag)
 {
 	if (flag & RADEON_CP_PACKET3_COMPUTE_MODE) {
 		r600_write_compute_context_reg(cs, reg, value);
@@ -846,7 +902,7 @@ static INLINE void r600_write_context_reg_flag(struct radeon_winsys_cs *cs, unsi
 	}
 }
 
-static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	r600_write_ctl_const_seq(cs, reg, 1);
 	radeon_emit(cs, value);
@@ -855,21 +911,21 @@ static INLINE void r600_write_ctl_const(struct radeon_winsys_cs *cs, unsigned re
 /*
  * common helpers
  */
-static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits)
+static inline uint32_t S_FIXED(float value, uint32_t frac_bits)
 {
 	return value * (1 << frac_bits);
 }
 #define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
 /* 12.4 fixed-point */
-static INLINE unsigned r600_pack_float_12p4(float x)
+static inline unsigned r600_pack_float_12p4(float x)
 {
 	return x <= 0    ? 0 :
 	       x >= 4096 ? 0xffff : x * 16;
 }
 
 /* Return if the depth format can be read without the DB->CB copy on r6xx-r7xx. */
-static INLINE bool r600_can_read_depth(struct r600_texture *rtex)
+static inline bool r600_can_read_depth(struct r600_texture *rtex)
 {
 	return rtex->resource.b.b.nr_samples <= 1 &&
 	       (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM ||
@@ -880,7 +936,7 @@ static INLINE bool r600_can_read_depth(struct r600_texture *rtex)
 #define     V_028A6C_OUTPRIM_TYPE_LINESTRIP            1
 #define     V_028A6C_OUTPRIM_TYPE_TRISTRIP             2
 
-static INLINE unsigned r600_conv_prim_to_gs_out(unsigned mode)
+static inline unsigned r600_conv_prim_to_gs_out(unsigned mode)
 {
 	static const int prim_conv[] = {
 		V_028A6C_OUTPRIM_TYPE_POINTLIST,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index af7622e9b34..8d1f95abddc 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -310,6 +310,7 @@ struct r600_shader_ctx {
 	int					gs_next_vertex;
 	struct r600_shader	*gs_for_vs;
 	int					gs_export_gpr_treg;
+	unsigned				enabled_stream_buffers_mask;
 };
 
 struct r600_shader_tgsi_instruction {
@@ -1402,6 +1403,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
 		 * with MEM_STREAM instructions */
 		output.array_size = 0xFFF;
 		output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
+
+		ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer);
+
 		if (ctx->bc->chip_class >= EVERGREEN) {
 			switch (so->output[i].output_buffer) {
 			case 0:
@@ -1718,6 +1722,8 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
 	gs->gs_copy_shader = cshader;
 
 	ctx.bc->nstack = 1;
+
+	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
 	cshader->shader.ring_item_size = ocnt * 16;
 
 	return r600_bytecode_build(ctx.bc);
@@ -1931,15 +1937,14 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
 	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
 			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
+	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
+	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
+
 	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
-		ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1;
-		ctx.temp_reg = ctx.bc->ar_reg + 2;
-		ctx.bc->index_reg[0] = ctx.bc->ar_reg + 3;
-		ctx.bc->index_reg[1] = ctx.bc->ar_reg + 4;
+		ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 3;
+		ctx.temp_reg = ctx.bc->ar_reg + 4;
 	} else {
-		ctx.temp_reg = ctx.bc->ar_reg + 1;
-		ctx.bc->index_reg[0] = ctx.bc->ar_reg + 2;
-		ctx.bc->index_reg[1] = ctx.bc->ar_reg + 3;
+		ctx.temp_reg = ctx.bc->ar_reg + 3;
 	}
 
 	shader->max_arrays = 0;
@@ -2086,7 +2091,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
 		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
 		radeon_llvm_ctx.stream_outputs = &so;
-		radeon_llvm_ctx.clip_vertex = ctx.cv_output;
 		radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
 		radeon_llvm_ctx.has_compressed_msaa_texturing =
 			ctx.bc->has_compressed_msaa_texturing;
@@ -2262,6 +2266,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	    so.num_outputs && !use_llvm)
 		emit_streamout(&ctx, &so);
 
+	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
 	convert_edgeflag_to_int(&ctx);
 
 	if (ring_outputs) {
@@ -2485,6 +2490,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			output[j].array_base = 0;
 			output[j].op = CF_OP_EXPORT;
 			j++;
+			shader->nr_ps_color_exports++;
 		}
 
 		noutput = j;
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index dd359d7e959..5d05c8153d7 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -125,6 +125,7 @@ struct r600_pipe_shader {
 	struct r600_shader_key	key;
 	unsigned		db_shader_control;
 	unsigned		ps_depth_export;
+	unsigned		enabled_stream_buffers_mask;
 };
 
 /* return the table index 0-5 for TGSI_INTERPOLATE_LINEAR/PERSPECTIVE and
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 960dfcedfef..5cc2283792d 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -473,7 +473,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 
 	/* offset */
 	rs->offset_units = state->offset_units;
-	rs->offset_scale = state->offset_scale * 12.0f;
+	rs->offset_scale = state->offset_scale * 16.0f;
 	rs->offset_enable = state->offset_point || state->offset_line || state->offset_tri;
 
 	if (state->point_size_per_vertex) {
@@ -802,7 +802,7 @@ static void r600_set_scissor_states(struct pipe_context *ctx,
 		return;
 
 	for (i = start_slot ; i < start_slot + num_scissors; i++) {
-		rctx->scissor[i].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->scissor[i].atom);
 	}
 }
 
@@ -1193,7 +1193,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (rctx->alphatest_state.bypass != alphatest_bypass) {
 			rctx->alphatest_state.bypass = alphatest_bypass;
-			rctx->alphatest_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 		}
 	}
 
@@ -1209,28 +1209,28 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 
 		if (state->zsbuf->format != rctx->poly_offset_state.zs_format) {
 			rctx->poly_offset_state.zs_format = state->zsbuf->format;
-			rctx->poly_offset_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom);
 		}
 
 		if (rctx->db_state.rsurf != surf) {
 			rctx->db_state.rsurf = surf;
-			rctx->db_state.atom.dirty = true;
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	} else if (rctx->db_state.rsurf) {
 		rctx->db_state.rsurf = NULL;
-		rctx->db_state.atom.dirty = true;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_state.atom);
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 
 	if (rctx->cb_misc_state.nr_cbufs != state->nr_cbufs) {
 		rctx->cb_misc_state.nr_cbufs = state->nr_cbufs;
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 
 	if (state->nr_cbufs == 0 && rctx->alphatest_state.bypass) {
 		rctx->alphatest_state.bypass = false;
-		rctx->alphatest_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 	}
 
 	/* Calculate the CS size. */
@@ -1250,7 +1250,7 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 		rctx->framebuffer.atom.num_dw += 2;
 	}
 
-	rctx->framebuffer.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
 
 	r600_set_sample_locations_constant_buffer(rctx);
 }
@@ -1541,9 +1541,9 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 
 	rctx->ps_iter_samples = min_samples;
 	if (rctx->framebuffer.nr_samples > 1) {
-		rctx->rasterizer_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->rasterizer_state.atom);
 		if (rctx->b.chip_class == R600)
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
@@ -2089,7 +2089,7 @@ bool r600_adjust_gprs(struct r600_context *rctx)
 	if (rctx->config_state.sq_gpr_resource_mgmt_1 != tmp || rctx->config_state.sq_gpr_resource_mgmt_2 != tmp2) {
 		rctx->config_state.sq_gpr_resource_mgmt_1 = tmp;
 		rctx->config_state.sq_gpr_resource_mgmt_2 = tmp2;
-		rctx->config_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->config_state.atom);
 		rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE;
 	}
 	return true;
@@ -2796,11 +2796,11 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 
 	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
-static INLINE unsigned r600_array_mode(unsigned mode)
+static inline unsigned r600_array_mode(unsigned mode)
 {
 	switch (mode) {
 	case RADEON_SURF_MODE_LINEAR_ALIGNED:	return V_0280A0_ARRAY_LINEAR_ALIGNED;
@@ -3074,8 +3074,8 @@ void r600_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5);
-	rctx->atoms[id++] = &rctx->b.streamout.begin_atom;
-	rctx->atoms[id++] = &rctx->b.streamout.enable_atom;
+	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
+	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
 	r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0);
 	r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 13dc9ee8c10..aa4a8d0240f 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -47,18 +47,26 @@ void r600_release_command_buffer(struct r600_command_buffer *cb)
 	FREE(cb->buf);
 }
 
+void r600_add_atom(struct r600_context *rctx,
+		   struct r600_atom *atom,
+		   unsigned id)
+{	/* Install atom in slot id of rctx->atoms and record that slot in the atom. */
+	assert(id < R600_NUM_ATOMS);	/* id must be below R600_NUM_ATOMS */
+	assert(rctx->atoms[id] == NULL);	/* the slot must not already be occupied */
+	rctx->atoms[id] = atom;
+	atom->id = id;	/* remember which slot this atom occupies */
+	atom->dirty = false;	/* a newly added atom starts out clean */
+}
+
 void r600_init_atom(struct r600_context *rctx,
 		    struct r600_atom *atom,
 		    unsigned id,
 		    void (*emit)(struct r600_context *ctx, struct r600_atom *state),
 		    unsigned num_dw)
 {
-	assert(id < R600_NUM_ATOMS);
-	assert(rctx->atoms[id] == NULL);
-	rctx->atoms[id] = atom;
 	atom->emit = (void*)emit;
 	atom->num_dw = num_dw;
-	atom->dirty = false;
+	r600_add_atom(rctx, atom, id);	/* slot checks, registration and the dirty reset are done by r600_add_atom() */
 }
 
 void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom)
@@ -127,11 +135,11 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx,
 	rctx->dual_src_blend = blend->dual_src_blend;
 
 	if (!blend_disable) {
-		r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer);
+		r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer);
 		color_control = blend->cb_color_control;
 	} else {
 		/* Blending is disabled. */
-		r600_set_cso_state_with_cb(&rctx->blend_state, blend, &blend->buffer_no_blend);
+		r600_set_cso_state_with_cb(rctx, &rctx->blend_state, blend, &blend->buffer_no_blend);
 		color_control = blend->cb_color_control_no_blend;
 	}
 
@@ -150,7 +158,7 @@ static void r600_bind_blend_state_internal(struct r600_context *rctx,
 		update_cb = true;
 	}
 	if (update_cb) {
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 }
 
@@ -160,7 +168,7 @@ static void r600_bind_blend_state(struct pipe_context *ctx, void *state)
 	struct r600_blend_state *blend = (struct r600_blend_state *)state;
 
 	if (blend == NULL) {
-		r600_set_cso_state_with_cb(&rctx->blend_state, NULL, NULL);
+		r600_set_cso_state_with_cb(rctx, &rctx->blend_state, NULL, NULL);
 		return;
 	}
 
@@ -173,7 +181,7 @@ static void r600_set_blend_color(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	rctx->blend_color.state = *state;
-	rctx->blend_color.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->blend_color.atom);
 }
 
 void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom)
@@ -210,7 +218,7 @@ static void r600_set_clip_state(struct pipe_context *ctx,
 	struct pipe_constant_buffer cb;
 
 	rctx->clip_state.state = *state;
-	rctx->clip_state.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->clip_state.atom);
 
 	cb.buffer = NULL;
 	cb.user_buffer = state->ucp;
@@ -226,7 +234,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx,
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	rctx->stencil_ref.state = *state;
-	rctx->stencil_ref.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->stencil_ref.atom);
 }
 
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom)
@@ -274,11 +282,11 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 	struct r600_stencil_ref ref;
 
 	if (state == NULL) {
-		r600_set_cso_state_with_cb(&rctx->dsa_state, NULL, NULL);
+		r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, NULL, NULL);
 		return;
 	}
 
-	r600_set_cso_state_with_cb(&rctx->dsa_state, dsa, &dsa->buffer);
+	r600_set_cso_state_with_cb(rctx, &rctx->dsa_state, dsa, &dsa->buffer);
 
 	ref.ref_value[0] = rctx->stencil_ref.pipe_state.ref_value[0];
 	ref.ref_value[1] = rctx->stencil_ref.pipe_state.ref_value[1];
@@ -293,7 +301,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 			 * we are having lockup on evergreen so do not enable
 			 * hyperz when not writing zbuffer
 			 */
-			rctx->db_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 		}
 	}
 
@@ -304,7 +312,7 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 	    rctx->alphatest_state.sx_alpha_ref != dsa->alpha_ref) {
 		rctx->alphatest_state.sx_alpha_test_control = dsa->sx_alpha_test_control;
 		rctx->alphatest_state.sx_alpha_ref = dsa->alpha_ref;
-		rctx->alphatest_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->alphatest_state.atom);
 	}
 }
 
@@ -318,14 +326,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 
 	rctx->rasterizer = rs;
 
-	r600_set_cso_state_with_cb(&rctx->rasterizer_state, rs, &rs->buffer);
+	r600_set_cso_state_with_cb(rctx, &rctx->rasterizer_state, rs, &rs->buffer);
 
 	if (rs->offset_enable &&
 	    (rs->offset_units != rctx->poly_offset_state.offset_units ||
 	     rs->offset_scale != rctx->poly_offset_state.offset_scale)) {
 		rctx->poly_offset_state.offset_units = rs->offset_units;
 		rctx->poly_offset_state.offset_scale = rs->offset_scale;
-		rctx->poly_offset_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom);
 	}
 
 	/* Update clip_misc_state. */
@@ -333,14 +341,14 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 	    rctx->clip_misc_state.clip_plane_enable != rs->clip_plane_enable) {
 		rctx->clip_misc_state.pa_cl_clip_cntl = rs->pa_cl_clip_cntl;
 		rctx->clip_misc_state.clip_plane_enable = rs->clip_plane_enable;
-		rctx->clip_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 	}
 
 	/* Workaround for a missing scissor enable on r600. */
 	if (rctx->b.chip_class == R600 &&
 	    rs->scissor_enable != rctx->scissor[0].enable) {
 		rctx->scissor[0].enable = rs->scissor_enable;
-		rctx->scissor[0].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->scissor[0].atom);
 	}
 
 	/* Re-emit PA_SC_LINE_STIPPLE. */
@@ -378,7 +386,7 @@ void r600_sampler_states_dirty(struct r600_context *rctx,
 		state->atom.num_dw =
 			util_bitcount(state->dirty_mask & state->has_bordercolor_mask) * 11 +
 			util_bitcount(state->dirty_mask & ~state->has_bordercolor_mask) * 5;
-		state->atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &state->atom);
 	}
 }
 
@@ -399,9 +407,9 @@ static void r600_bind_sampler_states(struct pipe_context *pipe,
 
 	assert(start == 0); /* XXX fix below */
 
-	if (shader != PIPE_SHADER_VERTEX &&
-	    shader != PIPE_SHADER_FRAGMENT) {
-		return;
+	if (!states) {
+		disable_mask = ~0u;
+		count = 0;
 	}
 
 	for (i = 0; i < count; i++) {
@@ -443,7 +451,7 @@ static void r600_bind_sampler_states(struct pipe_context *pipe,
 		/* change in TA_CNTL_AUX need a pipeline flush */
 		rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE;
 		rctx->seamless_cube_map.enabled = seamless_cube_map;
-		rctx->seamless_cube_map.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->seamless_cube_map.atom);
 	}
 }
 
@@ -483,7 +491,7 @@ static void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
-	r600_set_cso_state(&rctx->vertex_fetch_shader, state);
+	r600_set_cso_state(rctx, &rctx->vertex_fetch_shader, state);
 }
 
 static void r600_delete_vertex_elements(struct pipe_context *ctx, void *state)
@@ -513,7 +521,7 @@ void r600_vertex_buffers_dirty(struct r600_context *rctx)
 		rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
 		rctx->vertex_buffer_state.atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 12 : 11) *
 					       util_bitcount(rctx->vertex_buffer_state.dirty_mask);
-		rctx->vertex_buffer_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->vertex_buffer_state.atom);
 	}
 }
 
@@ -570,7 +578,7 @@ void r600_sampler_views_dirty(struct r600_context *rctx,
 		rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
 		state->atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 14 : 13) *
 				     util_bitcount(state->dirty_mask);
-		state->atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &state->atom);
 	}
 }
 
@@ -593,9 +601,9 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader,
 
 	assert(start == 0); /* XXX fix below */
 
-	if (shader == PIPE_SHADER_COMPUTE) {
-		evergreen_set_cs_sampler_view(pipe, start, count, views);
-		return;
+	if (!views) {
+		disable_mask = ~0u;
+		count = 0;
 	}
 
 	remaining_mask = dst->views.enabled_mask & disable_mask;
@@ -673,7 +681,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx,
 
 	for (i = start_slot; i < start_slot + num_viewports; i++) {
 		rctx->viewport[i].state = state[i - start_slot];
-		rctx->viewport[i].atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->viewport[i].atom);
 	}
 }
 
@@ -694,7 +702,7 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
 }
 
 /* Compute the key for the hw shader variant */
-static INLINE struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
+static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
 		struct r600_pipe_shader_selector * sel)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
@@ -913,7 +921,7 @@ void r600_constant_buffers_dirty(struct r600_context *rctx, struct r600_constbuf
 		rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE;
 		state->atom.num_dw = rctx->b.chip_class >= EVERGREEN ? util_bitcount(state->dirty_mask)*20
 								   : util_bitcount(state->dirty_mask)*19;
-		state->atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &state->atom);
 	}
 }
 
@@ -982,7 +990,7 @@ static void r600_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask
 		return;
 
 	rctx->sample_mask.sample_mask = sample_mask;
-	rctx->sample_mask.atom.dirty = true;
+	r600_mark_atom_dirty(rctx, &rctx->sample_mask.atom);
 }
 
 /*
@@ -1107,27 +1115,28 @@ static void update_shader_atom(struct pipe_context *ctx,
 			       struct r600_shader_state *state,
 			       struct r600_pipe_shader *shader)
 {
+	struct r600_context *rctx = (struct r600_context *)ctx;
+
 	state->shader = shader;
 	if (shader) {
 		state->atom.num_dw = shader->command_buffer.num_dw;
-		state->atom.dirty = true;
 		r600_context_add_resource_size(ctx, (struct pipe_resource *)shader->bo);
 	} else {
 		state->atom.num_dw = 0;
-		state->atom.dirty = false;
 	}
+	r600_mark_atom_dirty(rctx, &state->atom);
 }
 
 static void update_gs_block_state(struct r600_context *rctx, unsigned enable)
 {
 	if (rctx->shader_stages.geom_enable != enable) {
 		rctx->shader_stages.geom_enable = enable;
-		rctx->shader_stages.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 	}
 
 	if (rctx->gs_rings.enable != enable) {
 		rctx->gs_rings.enable = enable;
-		rctx->gs_rings.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->gs_rings.atom);
 
 		if (enable && !rctx->gs_rings.esgs_ring.buffer) {
 			unsigned size = 0x1C000;
@@ -1192,7 +1201,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 		if (!rctx->shader_stages.geom_enable) {
 			rctx->shader_stages.geom_enable = true;
-			rctx->shader_stages.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 		}
 
 		/* gs_shader provides GS and VS (copy shader) */
@@ -1206,8 +1215,9 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->gs_shader->current->gs_copy_shader->pa_cl_vs_out_cntl;
 				rctx->clip_misc_state.clip_dist_write = rctx->gs_shader->current->gs_copy_shader->shader.clip_dist_write;
 				rctx->clip_misc_state.clip_disable = rctx->gs_shader->current->shader.vs_position_window_space;
-				rctx->clip_misc_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 			}
+			rctx->b.streamout.enabled_stream_buffers_mask = rctx->gs_shader->current->gs_copy_shader->enabled_stream_buffers_mask;
 		}
 
 		r600_shader_select(ctx, rctx->vs_shader, &vs_dirty);
@@ -1223,7 +1233,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 			update_shader_atom(ctx, &rctx->geometry_shader, NULL);
 			update_shader_atom(ctx, &rctx->export_shader, NULL);
 			rctx->shader_stages.geom_enable = false;
-			rctx->shader_stages.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 		}
 
 		r600_shader_select(ctx, rctx->vs_shader, &vs_dirty);
@@ -1240,8 +1250,9 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				rctx->clip_misc_state.pa_cl_vs_out_cntl = rctx->vs_shader->current->pa_cl_vs_out_cntl;
 				rctx->clip_misc_state.clip_dist_write = rctx->vs_shader->current->shader.clip_dist_write;
 				rctx->clip_misc_state.clip_disable = rctx->vs_shader->current->shader.vs_position_window_space;
-				rctx->clip_misc_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 			}
+			rctx->b.streamout.enabled_stream_buffers_mask = rctx->vs_shader->current->enabled_stream_buffers_mask;
 		}
 	}
 
@@ -1252,7 +1263,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 		if (rctx->cb_misc_state.nr_ps_color_outputs != rctx->ps_shader->current->nr_ps_color_outputs) {
 			rctx->cb_misc_state.nr_ps_color_outputs = rctx->ps_shader->current->nr_ps_color_outputs;
-			rctx->cb_misc_state.atom.dirty = true;
+			r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 		}
 
 		if (rctx->b.chip_class <= R700) {
@@ -1260,7 +1271,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 			if (rctx->cb_misc_state.multiwrite != multiwrite) {
 				rctx->cb_misc_state.multiwrite = multiwrite;
-				rctx->cb_misc_state.atom.dirty = true;
+				r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 			}
 		}
 
@@ -1274,7 +1285,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 				r600_update_ps_state(ctx, rctx->ps_shader->current);
 		}
 
-		rctx->shader_stages.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->shader_stages.atom);
 		update_shader_atom(ctx, &rctx->pixel_shader, rctx->ps_shader->current);
 	}
 
@@ -1409,7 +1420,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 					data += info.indirect_offset / sizeof(unsigned);
 					start = data[2] * ib.index_size;
 					count = data[0];
-					rctx->b.ws->buffer_unmap(indirect_resource->cs_buf);
 				}
 				else {
 					start = 0;
@@ -1454,24 +1464,23 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		rctx->vgt_state.vgt_multi_prim_ib_reset_en = info.primitive_restart;
 		rctx->vgt_state.vgt_multi_prim_ib_reset_indx = info.restart_index;
 		rctx->vgt_state.vgt_indx_offset = info.index_bias;
-		rctx->vgt_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->vgt_state.atom);
 	}
 
 	/* Workaround for hardware deadlock on certain R600 ASICs: write into a CB register. */
 	if (rctx->b.chip_class == R600) {
 		rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
-		rctx->cb_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->cb_misc_state.atom);
 	}
 
 	/* Emit states. */
 	r600_need_cs_space(rctx, ib.user_buffer ? 5 : 0, TRUE);
 	r600_flush_emit(rctx);
 
-	for (i = 0; i < R600_NUM_ATOMS; i++) {
-		if (rctx->atoms[i] == NULL || !rctx->atoms[i]->dirty) {
-			continue;
-		}
+	i = r600_next_dirty_atom(rctx, 0);
+	while (i < R600_NUM_ATOMS) {
 		r600_emit_atom(rctx, rctx->atoms[i]);
+		i = r600_next_dirty_atom(rctx, i + 1);
 	}
 
 	if (rctx->b.chip_class == CAYMAN) {
@@ -2490,7 +2499,7 @@ static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable
 
 	if (rctx->db_misc_state.occlusion_query_enabled != enable) {
 		rctx->db_misc_state.occlusion_query_enabled = enable;
-		rctx->db_misc_state.atom.dirty = true;
+		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
index 2e38a62c05a..62680788c5e 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.cpp
+++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
@@ -489,7 +489,7 @@ bool alu_group_tracker::try_reserve(alu_node* n) {
 
 	n->bc.bank_swizzle = 0;
 
-	if (!trans & fbs)
+	if (!trans && fbs)
 		n->bc.bank_swizzle = VEC_210;
 
 	if (gpr.try_reserve(n)) {
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index fc5f6c29870..cb9809f2449 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -84,7 +84,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 		}
 	}
 
-	if (busy || ctx->ws->buffer_is_busy(resource->buf, rusage)) {
+	if (busy || !ctx->ws->buffer_wait(resource->buf, 0, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
 			return NULL;
 		} else {
@@ -121,7 +121,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 		/* Older kernels didn't always flush the HDP cache before
 		 * CS execution
 		 */
-		if (rscreen->info.drm_minor < 40) {
+		if (rscreen->info.drm_major == 2 &&
+		    rscreen->info.drm_minor < 40) {
 			res->domains = RADEON_DOMAIN_GTT;
 			flags |= RADEON_FLAG_GTT_WC;
 			break;
@@ -147,7 +148,8 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 		 * Write-combined CPU mappings are fine, the kernel ensures all CPU
 		 * writes finish before the GPU executes a command stream.
 		 */
-		if (rscreen->info.drm_minor < 40)
+		if (rscreen->info.drm_major == 2 &&
+		    rscreen->info.drm_minor < 40)
 			res->domains = RADEON_DOMAIN_GTT;
 		else if (res->domains & RADEON_DOMAIN_VRAM)
 			flags |= RADEON_FLAG_CPU_ACCESS;
@@ -161,6 +163,9 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 		flags |= RADEON_FLAG_NO_CPU_ACCESS;
 	}
 
+	if (rscreen->debug_flags & DBG_NO_WC)
+		flags &= ~RADEON_FLAG_GTT_WC;
+
 	/* Allocate a new resource. */
 	new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
 					     use_reusable_pool,
@@ -274,7 +279,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
 		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
-		    rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
+		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
 		}
 		/* At this point, the buffer is always idle. */
@@ -288,7 +293,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
 		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
-		    rctx->ws->buffer_is_busy(rbuffer->buf, RADEON_USAGE_READWRITE)) {
+		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
 			unsigned offset;
 			struct r600_resource *staging = NULL;
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index b51eebbc68e..03a04b754d6 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -33,7 +33,7 @@
 #include "r600_pipe_common.h"
 #include "r600d_common.h"
 
-static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx,
+static inline unsigned r600_context_bo_reloc(struct r600_common_context *rctx,
 					     struct r600_ring *ring,
 					     struct r600_resource *rbo,
 					     enum radeon_bo_usage usage,
@@ -59,7 +59,7 @@ static INLINE unsigned r600_context_bo_reloc(struct r600_common_context *rctx,
 				      rbo->domains, priority) * 4;
 }
 
-static INLINE void r600_emit_reloc(struct r600_common_context *rctx,
+static inline void r600_emit_reloc(struct r600_common_context *rctx,
 				   struct r600_ring *ring, struct r600_resource *rbo,
 				   enum radeon_bo_usage usage,
 				   enum radeon_bo_priority priority)
@@ -74,57 +74,57 @@ static INLINE void r600_emit_reloc(struct r600_common_context *rctx,
 	}
 }
 
-static INLINE void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg < R600_CONTEXT_REG_OFFSET);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);	/* bound is this CS's max_dw: 2 dwords emitted here plus num more (presumably the values the caller emits next) */
 	radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
 	radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2);
 }
 
-static INLINE void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)	/* emits a SET_CONFIG_REG sequence of length 1 followed by value */
 {
 	r600_write_config_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void r600_write_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CONTEXT_REG_OFFSET);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);	/* bound is this CS's max_dw: 2 dwords emitted here plus num more (presumably the values the caller emits next) */
 	radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
 	radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2);
 }
 
-static INLINE void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void r600_write_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)	/* emits a SET_CONTEXT_REG sequence of length 1 followed by value */
 {
 	r600_write_context_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void si_write_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);	/* bound is this CS's max_dw: 2 dwords emitted here plus num more (presumably the values the caller emits next) */
 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
 	radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
 }
 
-static INLINE void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void si_write_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)	/* emits a SET_SH_REG sequence of length 1 followed by value */
 {
 	si_write_sh_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static INLINE void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void cik_write_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
-	assert(cs->cdw+2+num <= RADEON_MAX_CMDBUF_DWORDS);
+	assert(cs->cdw+2+num <= cs->max_dw);	/* bound is this CS's max_dw: 2 dwords emitted here plus num more (presumably the values the caller emits next) */
 	radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
 	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
 }
 
-static INLINE void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void cik_write_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
 {
 	cik_write_uconfig_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 3def4446882..ed5d1dabdc3 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -108,9 +108,9 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
 {
 	/* Flush if there's not enough space. */
-	if ((num_dw + ctx->rings.dma.cs->cdw) > RADEON_MAX_CMDBUF_DWORDS) {
+	if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) {	/* limit is the DMA CS's own max_dw field */
 		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
-		assert((num_dw + ctx->rings.dma.cs->cdw) <= RADEON_MAX_CMDBUF_DWORDS);
+		assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw);	/* the request is expected to fit once the ring was flushed */
 	}
 }
 
@@ -132,10 +132,11 @@ void r600_preflush_suspend_features(struct r600_common_context *ctx)
 	}
 
 	/* suspend queries */
-	ctx->nontimer_queries_suspended = false;
+	ctx->queries_suspended_for_flush = false;
 	if (ctx->num_cs_dw_nontimer_queries_suspend) {
 		r600_suspend_nontimer_queries(ctx);
-		ctx->nontimer_queries_suspended = true;
+		r600_suspend_timer_queries(ctx);
+		ctx->queries_suspended_for_flush = true;
 	}
 
 	ctx->streamout.suspended = false;
@@ -153,8 +154,9 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 	}
 
 	/* resume queries */
-	if (ctx->nontimer_queries_suspended) {
+	if (ctx->queries_suspended_for_flush) {
 		r600_resume_nontimer_queries(ctx);
+		r600_resume_timer_queries(ctx);
 	}
 
 	/* Re-enable render condition. */
@@ -196,6 +198,19 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags,
 	rctx->rings.dma.flushing = false;
 }
 
+static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	unsigned latest = rctx->ws->query_value(rctx->ws,
+						RADEON_GPU_RESET_COUNTER);
+
+	if (rctx->gpu_reset_counter == latest)
+		return PIPE_NO_RESET;
+
+	rctx->gpu_reset_counter = latest;
+	return PIPE_UNKNOWN_CONTEXT_RESET;
+}
+
 bool r600_common_context_init(struct r600_common_context *rctx,
 			      struct r600_common_screen *rscreen)
 {
@@ -222,6 +237,13 @@ bool r600_common_context_init(struct r600_common_context *rctx,
         rctx->b.memory_barrier = r600_memory_barrier;
 	rctx->b.flush = r600_flush_from_st;
 
+	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
+		rctx->b.get_device_reset_status = r600_get_reset_status;
+		rctx->gpu_reset_counter =
+			rctx->ws->query_value(rctx->ws,
+					      RADEON_GPU_RESET_COUNTER);
+	}
+
 	LIST_INITHEAD(&rctx->texture_buffers);
 
 	r600_init_context_texture_functions(rctx);
@@ -240,8 +262,12 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (!rctx->uploader)
 		return false;
 
+	rctx->ctx = rctx->ws->ctx_create(rctx->ws);
+	if (!rctx->ctx)
+		return false;
+
 	if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
-		rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ws, RING_DMA,
+		rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 							 r600_flush_dma_ring,
 							 rctx, NULL);
 		rctx->rings.dma.flush = r600_flush_dma_ring;
@@ -252,12 +278,12 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 
 void r600_common_context_cleanup(struct r600_common_context *rctx)
 {
-	if (rctx->rings.gfx.cs) {
+	if (rctx->rings.gfx.cs)
 		rctx->ws->cs_destroy(rctx->rings.gfx.cs);
-	}
-	if (rctx->rings.dma.cs) {
+	if (rctx->rings.dma.cs)
 		rctx->ws->cs_destroy(rctx->rings.dma.cs);
-	}
+	if (rctx->ctx)
+		rctx->ws->ctx_destroy(rctx->ctx);
 
 	if (rctx->uploader) {
 		u_upload_destroy(rctx->uploader);
@@ -313,6 +339,11 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "gs", DBG_GS, "Print geometry shaders" },
 	{ "ps", DBG_PS, "Print pixel shaders" },
 	{ "cs", DBG_CS, "Print compute shaders" },
+	{ "tcs", DBG_TCS, "Print tessellation control shaders" },
+	{ "tes", DBG_TES, "Print tessellation evaluation shaders" },
+	{ "noir", DBG_NO_IR, "Don't print the LLVM IR"},
+	{ "notgsi", DBG_NO_TGSI, "Don't print the TGSI"},
+	{ "noasm", DBG_NO_ASM, "Don't print disassembled shaders"},
 
 	/* features */
 	{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
@@ -324,6 +355,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "switch_on_eop", DBG_SWITCH_ON_EOP, "Program WD/IA to switch on end-of-packet." },
 	{ "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." },
 	{ "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." },
+	{ "nowc", DBG_NO_WC, "Disable GTT write combining" },
 
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
@@ -338,11 +370,9 @@ static const char* r600_get_device_vendor(struct pipe_screen* pscreen)
 	return "AMD";
 }
 
-static const char* r600_get_name(struct pipe_screen* pscreen)
+static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 {
-	struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
-
-	switch (rscreen->family) {
+	switch (rscreen->info.family) {
 	case CHIP_R600: return "AMD R600";
 	case CHIP_RV610: return "AMD RV610";
 	case CHIP_RV630: return "AMD RV630";
@@ -378,10 +408,21 @@ static const char* r600_get_name(struct pipe_screen* pscreen)
 	case CHIP_KABINI: return "AMD KABINI";
 	case CHIP_HAWAII: return "AMD HAWAII";
 	case CHIP_MULLINS: return "AMD MULLINS";
+	case CHIP_TONGA: return "AMD TONGA";
+	case CHIP_ICELAND: return "AMD ICELAND";
+	case CHIP_CARRIZO: return "AMD CARRIZO";
+	case CHIP_FIJI: return "AMD FIJI";
 	default: return "AMD unknown";
 	}
 }
 
+static const char* r600_get_name(struct pipe_screen* pscreen)
+{
+	struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen;
+
+	return rscreen->renderer_string;
+}
+
 static float r600_get_paramf(struct pipe_screen* pscreen,
 			     enum pipe_capf param)
 {
@@ -495,6 +536,10 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 #else
 		return "kabini";
 #endif
+	case CHIP_TONGA: return "tonga";
+	case CHIP_ICELAND: return "iceland";
+	case CHIP_CARRIZO: return "carrizo";
+	case CHIP_FIJI: return "fiji";
 	default: return "";
 	}
 }
@@ -636,6 +681,12 @@ static int r600_get_compute_param(struct pipe_screen *screen,
 		return sizeof(uint32_t);
 	case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
 		break; /* unused */
+	case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+		if (ret) {
+			uint32_t *subgroup_size = ret;
+			*subgroup_size = r600_wavefront_size(rscreen->family);
+		}
+		return sizeof(uint32_t);
 	}
 
         fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
@@ -656,25 +707,33 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 	struct pipe_driver_query_info list[] = {
+		{"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
+		{"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"draw-calls", R600_QUERY_DRAW_CALLS, {0}},
 		{"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
-		{"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}},
+		{"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}},
-		{"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES},
+		{"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES,
+		 PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
 		{"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
 		{"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
+		{"GPU-load", R600_QUERY_GPU_LOAD, {100}},
 		{"temperature", R600_QUERY_GPU_TEMPERATURE, {100}},
-		{"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}},
-		{"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}},
-		{"GPU-load", R600_QUERY_GPU_LOAD, {100}}
+		{"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
+		{"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
 	};
 	unsigned num_queries;
 
 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
 		num_queries = Elements(list);
+	else if (rscreen->info.drm_major == 3)
+		num_queries = Elements(list) - 3;
 	else
-		num_queries = 8;
+		num_queries = Elements(list) - 4;
 
 	if (!info)
 		return num_queries;
@@ -695,14 +754,6 @@ static void r600_fence_reference(struct pipe_screen *screen,
 	rws->fence_reference(ptr, fence);
 }
 
-static boolean r600_fence_signalled(struct pipe_screen *screen,
-				    struct pipe_fence_handle *fence)
-{
-	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
-
-	return rws->fence_wait(rws, fence, 0);
-}
-
 static boolean r600_fence_finish(struct pipe_screen *screen,
 				 struct pipe_fence_handle *fence,
 				 uint64_t timeout)
@@ -837,8 +888,22 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
 bool r600_common_screen_init(struct r600_common_screen *rscreen,
 			     struct radeon_winsys *ws)
 {
+	char llvm_string[32] = {};
+
 	ws->query_info(ws, &rscreen->info);
 
+#if HAVE_LLVM
+	snprintf(llvm_string, sizeof(llvm_string),
+		 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
+#endif
+
+	snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
+		 "%s (DRM %i.%i.%i%s)",
+		 r600_get_chip_name(rscreen), rscreen->info.drm_major,
+		 rscreen->info.drm_minor, rscreen->info.drm_patchlevel,
+		 llvm_string);
+
 	rscreen->b.get_name = r600_get_name;
 	rscreen->b.get_vendor = r600_get_vendor;
 	rscreen->b.get_device_vendor = r600_get_device_vendor;
@@ -848,7 +913,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	rscreen->b.get_timestamp = r600_get_timestamp;
 	rscreen->b.fence_finish = r600_fence_finish;
 	rscreen->b.fence_reference = r600_fence_reference;
-	rscreen->b.fence_signalled = r600_fence_signalled;
 	rscreen->b.resource_destroy = u_resource_destroy_vtbl;
 	rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory;
 
@@ -874,7 +938,9 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	pipe_mutex_init(rscreen->aux_context_lock);
 	pipe_mutex_init(rscreen->gpu_load_mutex);
 
-	if (rscreen->info.drm_minor >= 28 && (rscreen->debug_flags & DBG_TRACE_CS)) {
+	if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) ||
+	     rscreen->info.drm_major == 3) &&
+	    (rscreen->debug_flags & DBG_TRACE_CS)) {
 		rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b,
 										PIPE_BIND_CUSTOM,
 										PIPE_USAGE_STAGING,
@@ -922,10 +988,8 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 	pipe_mutex_destroy(rscreen->aux_context_lock);
 	rscreen->aux_context->destroy(rscreen->aux_context);
 
-	if (rscreen->trace_bo) {
-		rscreen->ws->buffer_unmap(rscreen->trace_bo->cs_buf);
+	if (rscreen->trace_bo)
 		pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL);
-	}
 
 	rscreen->ws->destroy(rscreen->ws);
 	FREE(rscreen);
@@ -941,6 +1005,10 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen,
 	switch (tgsi_get_processor_type(tokens)) {
 	case TGSI_PROCESSOR_VERTEX:
 		return (rscreen->debug_flags & DBG_VS) != 0;
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return (rscreen->debug_flags & DBG_TCS) != 0;
+	case TGSI_PROCESSOR_TESS_EVAL:
+		return (rscreen->debug_flags & DBG_TES) != 0;
 	case TGSI_PROCESSOR_GEOMETRY:
 		return (rscreen->debug_flags & DBG_GS) != 0;
 	case TGSI_PROCESSOR_FRAGMENT:
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 6ce81d33ddd..29db1cc4e07 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -59,6 +59,8 @@
 #define R600_QUERY_CURRENT_GPU_SCLK	(PIPE_QUERY_DRIVER_SPECIFIC + 9)
 #define R600_QUERY_CURRENT_GPU_MCLK	(PIPE_QUERY_DRIVER_SPECIFIC + 10)
 #define R600_QUERY_GPU_LOAD		(PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define R600_QUERY_NUM_COMPILATIONS	(PIPE_QUERY_DRIVER_SPECIFIC + 12)
+#define R600_QUERY_NUM_SHADERS_CREATED	(PIPE_QUERY_DRIVER_SPECIFIC + 13)
 
 #define R600_CONTEXT_STREAMOUT_FLUSH		(1u << 0)
 #define R600_CONTEXT_PRIVATE_FLAG		(1u << 1)
@@ -79,17 +81,23 @@
 #define DBG_GS			(1 << 7)
 #define DBG_PS			(1 << 8)
 #define DBG_CS			(1 << 9)
+#define DBG_TCS			(1 << 10)
+#define DBG_TES			(1 << 11)
+#define DBG_NO_IR		(1 << 12)
+#define DBG_NO_TGSI		(1 << 13)
+#define DBG_NO_ASM		(1 << 14)
+/* Bits 21-31 are reserved for the r600g driver. */
 /* features */
-#define DBG_NO_ASYNC_DMA	(1 << 10)
-#define DBG_NO_HYPERZ		(1 << 11)
-#define DBG_NO_DISCARD_RANGE	(1 << 12)
-#define DBG_NO_2D_TILING	(1 << 13)
-#define DBG_NO_TILING		(1 << 14)
-#define DBG_SWITCH_ON_EOP	(1 << 15)
-#define DBG_FORCE_DMA		(1 << 16)
-#define DBG_PRECOMPILE		(1 << 17)
-#define DBG_INFO		(1 << 18)
-/* The maximum allowed bit is 20. */
+#define DBG_NO_ASYNC_DMA	(1llu << 32)
+#define DBG_NO_HYPERZ		(1llu << 33)
+#define DBG_NO_DISCARD_RANGE	(1llu << 34)
+#define DBG_NO_2D_TILING	(1llu << 35)
+#define DBG_NO_TILING		(1llu << 36)
+#define DBG_SWITCH_ON_EOP	(1llu << 37)
+#define DBG_FORCE_DMA		(1llu << 38)
+#define DBG_PRECOMPILE		(1llu << 39)
+#define DBG_INFO		(1llu << 40)
+#define DBG_NO_WC		(1llu << 41)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 
@@ -127,9 +135,8 @@ struct radeon_shader_binary {
 	struct radeon_shader_reloc *relocs;
 	unsigned reloc_count;
 
-	/** Set to 1 if the disassembly for this binary has been dumped to
-	 *  stderr. */
-	int disassembled;
+	/** Disassembled shader in a string. */
+	char *disasm_string;
 };
 
 struct r600_resource {
@@ -214,7 +221,6 @@ struct r600_texture {
 	float				depth_clear_value;
 
 	bool				non_disp_tiling; /* R600-Cayman only */
-	unsigned			mipmap_shift;
 };
 
 struct r600_surface {
@@ -236,6 +242,7 @@ struct r600_surface {
 	unsigned cb_color_pitch;	/* EG and later */
 	unsigned cb_color_slice;	/* EG and later */
 	unsigned cb_color_attrib;	/* EG and later */
+	unsigned cb_dcc_control;	/* VI and later */
 	unsigned cb_color_fmask;	/* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */
 	unsigned cb_color_fmask_slice;	/* EG and later */
 	unsigned cb_color_cmask;	/* CB_COLORn_TILE (r600 only) */
@@ -272,7 +279,7 @@ struct r600_common_screen {
 	enum chip_class			chip_class;
 	struct radeon_info		info;
 	struct r600_tiling_info		tiling_info;
-	unsigned			debug_flags;
+	uint64_t			debug_flags;
 	bool				has_cp_dma;
 	bool				has_streamout;
 
@@ -285,12 +292,23 @@ struct r600_common_screen {
 	uint32_t			*trace_ptr;
 	unsigned			cs_count;
 
+	/* This must be in the screen, because UE4 uses one context for
+	 * compilation and another one for rendering.
+	 */
+	unsigned			num_compilations;
+	/* Along with ST_DEBUG=precompile, this should show if applications
+	 * are loading shaders on demand. This is a monotonic counter.
+	 */
+	unsigned			num_shaders_created;
+
 	/* GPU load thread. */
 	pipe_mutex			gpu_load_mutex;
 	pipe_thread			gpu_load_thread;
 	unsigned			gpu_load_counter_busy;
 	unsigned			gpu_load_counter_idle;
-	unsigned			gpu_load_stop_thread; /* bool */
+	volatile unsigned		gpu_load_stop_thread; /* bool */
+
+	char				renderer_string[64];
 };
 
 /* This encapsulates a state or an operation which can emitted into the GPU
@@ -298,6 +316,7 @@ struct r600_common_screen {
 struct r600_atom {
 	void (*emit)(struct r600_common_context *ctx, struct r600_atom *state);
 	unsigned		num_dw;
+	unsigned short		id;	/* used by r600 only */
 	bool			dirty;
 };
 
@@ -327,6 +346,10 @@ struct r600_streamout {
 	/* External state which comes from the vertex shader,
 	 * it must be set explicitly when binding a shader. */
 	unsigned			*stride_in_dw;
+	unsigned			enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
+
+	/* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */
+	unsigned			hw_enabled_mask;
 
 	/* The state of VGT_STRMOUT_(CONFIG|EN). */
 	struct r600_atom		enable_atom;
@@ -352,10 +375,12 @@ struct r600_common_context {
 
 	struct r600_common_screen	*screen;
 	struct radeon_winsys		*ws;
+	struct radeon_winsys_ctx	*ctx;
 	enum radeon_family		family;
 	enum chip_class			chip_class;
 	struct r600_rings		rings;
 	unsigned			initial_gfx_cs_size;
+	unsigned			gpu_reset_counter;
 
 	struct u_upload_mgr		*uploader;
 	struct u_suballocator		*allocator_so_filled_size;
@@ -376,11 +401,14 @@ struct r600_common_context {
 	int				num_occlusion_queries;
 	/* Keep track of non-timer queries, because they should be suspended
 	 * during context flushing.
-	 * The timer queries (TIME_ELAPSED) shouldn't be suspended. */
+	 * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits,
+	 * but they should be suspended between IBs. */
 	struct list_head		active_nontimer_queries;
+	struct list_head		active_timer_queries;
 	unsigned			num_cs_dw_nontimer_queries_suspend;
+	unsigned			num_cs_dw_timer_queries_suspend;
 	/* If queries have been suspended. */
-	bool				nontimer_queries_suspended;
+	bool				queries_suspended_for_flush;
 	/* Additional hardware info. */
 	unsigned			backend_mask;
 	unsigned			max_db; /* for OQ */
@@ -441,6 +469,9 @@ struct r600_common_context {
 	/* This ensures there is enough space in the command stream. */
 	void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw,
 				  bool include_draw_vbo);
+
+	void (*set_atom_dirty)(struct r600_common_context *ctx,
+			       struct r600_atom *atom, bool dirty);
 };
 
 /* r600_buffer.c */
@@ -495,6 +526,8 @@ unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
 void r600_query_init(struct r600_common_context *rctx);
 void r600_suspend_nontimer_queries(struct r600_common_context *ctx);
 void r600_resume_nontimer_queries(struct r600_common_context *ctx);
+void r600_suspend_timer_queries(struct r600_common_context *ctx);
+void r600_resume_timer_queries(struct r600_common_context *ctx);
 void r600_query_init_backend_mask(struct r600_common_context *ctx);
 
 /* r600_streamout.c */
@@ -549,12 +582,12 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 
 /* Inline helpers. */
 
-static INLINE struct r600_resource *r600_resource(struct pipe_resource *r)
+static inline struct r600_resource *r600_resource(struct pipe_resource *r)
 {
 	return (struct r600_resource*)r;
 }
 
-static INLINE void
+static inline void
 r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
 {
 	pipe_resource_reference((struct pipe_resource **)ptr,
@@ -570,6 +603,26 @@ static inline unsigned r600_tex_aniso_filter(unsigned filter)
 	 /* else */        return 4;
 }
 
+static inline unsigned r600_wavefront_size(enum radeon_family family)
+{
+	switch (family) {
+	case CHIP_RV610:
+	case CHIP_RS780:
+	case CHIP_RV620:
+	case CHIP_RS880:
+		return 16;
+	case CHIP_RV630:
+	case CHIP_RV635:
+	case CHIP_RV730:
+	case CHIP_RV710:
+	case CHIP_PALM:
+	case CHIP_CEDAR:
+		return 32;
+	default:
+		return 64;
+	}
+}
+
 #define COMPUTE_DBG(rscreen, fmt, args...) \
 	do { \
 		if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 71f4a1522f9..7057aa19a7c 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -54,6 +54,8 @@ struct r600_query {
 	uint64_t end_result;
 	/* Fence for GPU_FINISHED. */
 	struct pipe_fence_handle *fence;
+	/* For transform feedback: which stream the query is for */
+	unsigned stream;
 };
 
 
@@ -90,6 +92,8 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
 	case R600_QUERY_CURRENT_GPU_SCLK:
 	case R600_QUERY_CURRENT_GPU_MCLK:
 	case R600_QUERY_GPU_LOAD:
+	case R600_QUERY_NUM_COMPILATIONS:
+	case R600_QUERY_NUM_SHADERS_CREATED:
 		return NULL;
 	}
 
@@ -118,7 +122,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
 			}
 			results += 4 * ctx->max_db;
 		}
-		ctx->ws->buffer_unmap(buf->cs_buf);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 	case PIPE_QUERY_TIMESTAMP:
@@ -130,7 +133,6 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 		results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
 		memset(results, 0, buf_size);
-		ctx->ws->buffer_unmap(buf->cs_buf);
 		break;
 	default:
 		assert(0);
@@ -157,6 +159,17 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
 	}
 }
 
+static unsigned event_type_for_stream(struct r600_query *query)
+{
+	switch (query->stream) {
+	default:
+	case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
+	case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
+	case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
+	case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
+	}
+}
+
 static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query)
 {
 	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
@@ -191,7 +204,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 	case PIPE_QUERY_SO_STATISTICS:
 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3));
+		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
 		radeon_emit(cs, va);
 		radeon_emit(cs, (va >> 32UL) & 0xFF);
 		break;
@@ -215,9 +228,10 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 	r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
 			RADEON_PRIO_MIN);
 
-	if (!r600_is_timer_query(query->type)) {
+	if (r600_is_timer_query(query->type))
+		ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw;
+	else
 		ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw;
-	}
 }
 
 static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query)
@@ -248,7 +262,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
 		va += query->buffer.results_end + query->result_size/2;
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
-		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3));
+		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
 		radeon_emit(cs, va);
 		radeon_emit(cs, (va >> 32UL) & 0xFF);
 		break;
@@ -279,9 +293,10 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	query->buffer.results_end += query->result_size;
 
 	if (r600_query_needs_begin(query->type)) {
-		if (!r600_is_timer_query(query->type)) {
+		if (r600_is_timer_query(query->type))
+			ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw;
+		else
 			ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw;
-		}
 	}
 
 	r600_update_occlusion_query_state(ctx, query->type, -1);
@@ -292,6 +307,13 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct
 					int operation, bool flag_wait)
 {
 	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	uint32_t op = PRED_OP(operation);
+
+	/* if true then invert, see GL_ARB_conditional_render_inverted */
+	if (ctx->current_render_cond_cond)
+		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible/overflow */
+	else
+		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible/overflow */
 
 	if (operation == PREDICATION_OP_CLEAR) {
 		ctx->need_gfx_cs_space(&ctx->b, 3, FALSE);
@@ -302,24 +324,21 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct
 	} else {
 		struct r600_query_buffer *qbuf;
 		unsigned count;
-		uint32_t op;
-
 		/* Find how many results there are. */
 		count = 0;
 		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
 			count += qbuf->results_end / query->result_size;
 		}
-
+	
 		ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE);
-
-		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
-				(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
-
+	
+		op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+	
 		/* emit predicate packets for all data blocks */
 		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
 			unsigned results_base = 0;
 			uint64_t va = qbuf->buf->gpu_address;
-
+	
 			while (results_base < qbuf->results_end) {
 				radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
 				radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL);
@@ -327,7 +346,7 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct
 				r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ,
 						RADEON_PRIO_MIN);
 				results_base += query->result_size;
-
+	
 				/* set CONTINUE bit for all packets except the first */
 				op |= PREDICATION_CONTINUE;
 			}
@@ -369,6 +388,7 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q
 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
 		query->result_size = 32;
 		query->num_cs_dw = 6;
+		query->stream = index;
 		break;
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 		/* 11 values on EG, 8 on R600. */
@@ -390,6 +410,8 @@ static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned q
 	case R600_QUERY_CURRENT_GPU_SCLK:
 	case R600_QUERY_CURRENT_GPU_MCLK:
 	case R600_QUERY_GPU_LOAD:
+	case R600_QUERY_NUM_COMPILATIONS:
+	case R600_QUERY_NUM_SHADERS_CREATED:
 		skip_allocation = true;
 		break;
 	default:
@@ -454,7 +476,7 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 		rquery->begin_result = 0;
 		return true;
 	case R600_QUERY_BUFFER_WAIT_TIME:
-		rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS);
+		rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
 		return true;
 	case R600_QUERY_NUM_CS_FLUSHES:
 		rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);
@@ -465,6 +487,12 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 	case R600_QUERY_GPU_LOAD:
 		rquery->begin_result = r600_gpu_load_begin(rctx->screen);
 		return true;
+	case R600_QUERY_NUM_COMPILATIONS:
+		rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+		return true;
+	case R600_QUERY_NUM_SHADERS_CREATED:
+		rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+		return true;
 	}
 
 	/* Discard the old query buffers. */
@@ -477,7 +505,7 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 
 	/* Obtain a new buffer if the current one can't be mapped without a stall. */
 	if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
-	    rctx->ws->buffer_is_busy(rquery->buffer.buf->buf, RADEON_USAGE_READWRITE)) {
+	    !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
 		pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL);
 		rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type);
 	}
@@ -487,9 +515,10 @@ static boolean r600_begin_query(struct pipe_context *ctx,
 
 	r600_emit_query_begin(rctx, rquery);
 
-	if (!r600_is_timer_query(rquery->type)) {
+	if (r600_is_timer_query(rquery->type))
+		LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries);
+	else
 		LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries);
-	}
    return true;
 }
 
@@ -515,7 +544,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY);
 		return;
 	case R600_QUERY_BUFFER_WAIT_TIME:
-		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS);
+		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
 		return;
 	case R600_QUERY_NUM_CS_FLUSHES:
 		rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);
@@ -541,13 +570,18 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 	case R600_QUERY_GPU_LOAD:
 		rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result);
 		return;
+	case R600_QUERY_NUM_COMPILATIONS:
+		rquery->end_result = p_atomic_read(&rctx->screen->num_compilations);
+		return;
+	case R600_QUERY_NUM_SHADERS_CREATED:
+		rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
+		return;
 	}
 
 	r600_emit_query_end(rctx, rquery);
 
-	if (r600_query_needs_begin(rquery->type) && !r600_is_timer_query(rquery->type)) {
+	if (r600_query_needs_begin(rquery->type))
 		LIST_DELINIT(&rquery->list);
-	}
 }
 
 static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
@@ -601,6 +635,8 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
 	case R600_QUERY_GPU_TEMPERATURE:
 	case R600_QUERY_CURRENT_GPU_SCLK:
 	case R600_QUERY_CURRENT_GPU_MCLK:
+	case R600_QUERY_NUM_COMPILATIONS:
+	case R600_QUERY_NUM_SHADERS_CREATED:
 		result->u64 = query->end_result - query->begin_result;
 		return TRUE;
 	case R600_QUERY_GPU_LOAD:
@@ -751,7 +787,6 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
 		assert(0);
 	}
 
-	ctx->ws->buffer_unmap(qbuf->buf->cs_buf);
 	return TRUE;
 }
 
@@ -823,22 +858,37 @@ static void r600_render_condition(struct pipe_context *ctx,
 	}
 }
 
-void r600_suspend_nontimer_queries(struct r600_common_context *ctx)
+static void r600_suspend_queries(struct r600_common_context *ctx,
+				 struct list_head *query_list,
+				 unsigned *num_cs_dw_queries_suspend)
 {
 	struct r600_query *query;
 
-	LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) {
+	LIST_FOR_EACH_ENTRY(query, query_list, list) {
 		r600_emit_query_end(ctx, query);
 	}
-	assert(ctx->num_cs_dw_nontimer_queries_suspend == 0);
+	assert(*num_cs_dw_queries_suspend == 0);
 }
 
-static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx)
+void r600_suspend_nontimer_queries(struct r600_common_context *ctx)
+{
+	r600_suspend_queries(ctx, &ctx->active_nontimer_queries,
+			     &ctx->num_cs_dw_nontimer_queries_suspend);
+}
+
+void r600_suspend_timer_queries(struct r600_common_context *ctx)
+{
+	r600_suspend_queries(ctx, &ctx->active_timer_queries,
+			     &ctx->num_cs_dw_timer_queries_suspend);
+}
+
+static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
+						    struct list_head *query_list)
 {
 	struct r600_query *query;
 	unsigned num_dw = 0;
 
-	LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) {
+	LIST_FOR_EACH_ENTRY(query, query_list, list) {
 		/* begin + end */
 		num_dw += query->num_cs_dw * 2;
 
@@ -857,21 +907,35 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *
 	return num_dw;
 }
 
-void r600_resume_nontimer_queries(struct r600_common_context *ctx)
+static void r600_resume_queries(struct r600_common_context *ctx,
+				struct list_head *query_list,
+				unsigned *num_cs_dw_queries_suspend)
 {
 	struct r600_query *query;
+	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list);
 
-	assert(ctx->num_cs_dw_nontimer_queries_suspend == 0);
+	assert(*num_cs_dw_queries_suspend == 0);
 
 	/* Check CS space here. Resuming must not be interrupted by flushes. */
-	ctx->need_gfx_cs_space(&ctx->b,
-			       r600_queries_num_cs_dw_for_resuming(ctx), TRUE);
+	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE);
 
-	LIST_FOR_EACH_ENTRY(query, &ctx->active_nontimer_queries, list) {
+	LIST_FOR_EACH_ENTRY(query, query_list, list) {
 		r600_emit_query_begin(ctx, query);
 	}
 }
 
+void r600_resume_nontimer_queries(struct r600_common_context *ctx)
+{
+	r600_resume_queries(ctx, &ctx->active_nontimer_queries,
+			    &ctx->num_cs_dw_nontimer_queries_suspend);
+}
+
+void r600_resume_timer_queries(struct r600_common_context *ctx)
+{
+	r600_resume_queries(ctx, &ctx->active_timer_queries,
+			    &ctx->num_cs_dw_timer_queries_suspend);
+}
+
 /* Get backends mask */
 void r600_query_init_backend_mask(struct r600_common_context *ctx)
 {
@@ -919,7 +983,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
 	if (results) {
 		memset(results, 0, ctx->max_db * 4 * 4);
-		ctx->ws->buffer_unmap(buffer->cs_buf);
 
 		/* emit EVENT_WRITE for ZPASS_DONE */
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -937,7 +1000,6 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 				if (results[i*4 + 1])
 					mask |= (1<<i);
 			}
-			ctx->ws->buffer_unmap(buffer->cs_buf);
 		}
 	}
 
@@ -966,4 +1028,5 @@ void r600_query_init(struct r600_common_context *rctx)
 	    rctx->b.render_condition = r600_render_condition;
 
 	LIST_INITHEAD(&rctx->active_nontimer_queries);
+	LIST_INITHEAD(&rctx->active_timer_queries);
 }
diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c
index bc8bf97ef89..0853f636a27 100644
--- a/src/gallium/drivers/radeon/r600_streamout.c
+++ b/src/gallium/drivers/radeon/r600_streamout.c
@@ -88,8 +88,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx)
 		12 + /* flush_vgt_streamout */
 		num_bufs * 11; /* STRMOUT_BUFFER_UPDATE, BUFFER_SIZE */
 
-	begin->num_dw = 12 + /* flush_vgt_streamout */
-			3; /* VGT_STRMOUT_BUFFER_CONFIG */
+	begin->num_dw = 12; /* flush_vgt_streamout */
 
 	if (rctx->chip_class >= SI) {
 		begin->num_dw += num_bufs * 4; /* SET_CONTEXT_REG */
@@ -105,7 +104,7 @@ void r600_streamout_buffers_dirty(struct r600_common_context *rctx)
 		(num_bufs - num_bufs_appended) * 6 + /* STRMOUT_BUFFER_UPDATE */
 		(rctx->family > CHIP_R600 && rctx->family < CHIP_RS780 ? 2 : 0); /* SURFACE_BASE_UPDATE */
 
-	begin->dirty = true;
+	rctx->set_atom_dirty(rctx, begin, true);
 
 	r600_set_streamout_enable(rctx, true);
 }
@@ -146,7 +145,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx,
 	if (num_targets) {
 		r600_streamout_buffers_dirty(rctx);
 	} else {
-		rctx->streamout.begin_atom.dirty = false;
+		rctx->set_atom_dirty(rctx, &rctx->streamout.begin_atom, false);
 		r600_set_streamout_enable(rctx, false);
 	}
 }
@@ -192,11 +191,6 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 
 	r600_flush_vgt_streamout(rctx);
 
-	r600_write_context_reg(cs, rctx->chip_class >= EVERGREEN ?
-				       R_028B98_VGT_STRMOUT_BUFFER_CONFIG :
-				       R_028B20_VGT_STRMOUT_BUFFER_EN,
-			       rctx->streamout.enabled_mask);
-
 	for (i = 0; i < rctx->streamout.num_targets; i++) {
 		if (!t[i])
 			continue;
@@ -326,20 +320,42 @@ static bool r600_get_strmout_en(struct r600_common_context *rctx)
 static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 				       struct r600_atom *atom)
 {
-	r600_write_context_reg(rctx->rings.gfx.cs,
-			       rctx->chip_class >= EVERGREEN ?
-				       R_028B94_VGT_STRMOUT_CONFIG :
-				       R_028AB0_VGT_STRMOUT_EN,
-			       S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx)));
+	unsigned strmout_config_reg = R_028AB0_VGT_STRMOUT_EN; /* default, used when chip_class < EVERGREEN */
+	unsigned strmout_config_val = S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx));
+	unsigned strmout_buffer_reg = R_028B20_VGT_STRMOUT_BUFFER_EN; /* default, used when chip_class < EVERGREEN */
+	unsigned strmout_buffer_val = rctx->streamout.hw_enabled_mask &
+				      rctx->streamout.enabled_stream_buffers_mask; /* only bits set in both masks */
+
+	if (rctx->chip_class >= EVERGREEN) { /* switch both registers and extend the config value */
+		strmout_buffer_reg = R_028B98_VGT_STRMOUT_BUFFER_CONFIG;
+
+		strmout_config_reg = R_028B94_VGT_STRMOUT_CONFIG;
+		strmout_config_val |=
+			S_028B94_RAST_STREAM(0) | /* RAST_STREAM field is set to 0 */
+			S_028B94_STREAMOUT_1_EN(r600_get_strmout_en(rctx)) | /* STREAMOUT_1..3_EN get the same value as STREAMOUT_0_EN */
+			S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) |
+			S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx));
+	}
+	r600_write_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val); /* buffer-enable mask is written first */
+	r600_write_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val); /* then the stream enable/config register */
 }
 
 static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
 {
 	bool old_strmout_en = r600_get_strmout_en(rctx);
+	unsigned old_hw_enabled_mask = rctx->streamout.hw_enabled_mask; /* snapshot so a mask change can be detected below */
 
 	rctx->streamout.streamout_enabled = enable;
-	if (old_strmout_en != r600_get_strmout_en(rctx))
-		rctx->streamout.enable_atom.dirty = true;
+
+	rctx->streamout.hw_enabled_mask = rctx->streamout.enabled_mask | /* replicate enabled_mask at bit offsets 0, 4, 8 and 12 */
+					  (rctx->streamout.enabled_mask << 4) |
+					  (rctx->streamout.enabled_mask << 8) |
+					  (rctx->streamout.enabled_mask << 12); /* NOTE(review): presumably one 4-bit group per stream - confirm */
+
+	if ((old_strmout_en != r600_get_strmout_en(rctx)) || /* mark the enable atom dirty if either value changed */
+            (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask)) {
+		rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true);
+	}
 }
 
 void r600_update_prims_generated_query_state(struct r600_common_context *rctx,
@@ -354,8 +370,9 @@ void r600_update_prims_generated_query_state(struct r600_common_context *rctx,
 		rctx->streamout.prims_gen_query_enabled =
 			rctx->streamout.num_prims_gen_queries != 0;
 
-		if (old_strmout_en != r600_get_strmout_en(rctx))
-			rctx->streamout.enable_atom.dirty = true;
+		if (old_strmout_en != r600_get_strmout_en(rctx)) {
+			rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true);
+		}
 	}
 }
 
@@ -365,5 +382,5 @@ void r600_streamout_init(struct r600_common_context *rctx)
 	rctx->b.stream_output_target_destroy = r600_so_target_destroy;
 	rctx->streamout.begin_atom.emit = r600_emit_streamout_begin;
 	rctx->streamout.enable_atom.emit = r600_emit_streamout_enable;
-	rctx->streamout.enable_atom.num_dw = 3;
+	rctx->streamout.enable_atom.num_dw = 6;
 }
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index dc510c99749..54696910e43 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -243,10 +243,11 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 				       RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR,
 				       surface->level[0].mode >= RADEON_SURF_MODE_2D ?
 				       RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR,
+				       surface->pipe_config,
 				       surface->bankw, surface->bankh,
 				       surface->tile_split,
 				       surface->stencil_tile_split,
-				       surface->mtilea,
+				       surface->mtilea, surface->num_banks,
 				       surface->level[0].pitch_bytes,
 				       (surface->flags & RADEON_SURF_SCANOUT) != 0);
 
@@ -489,7 +490,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	unsigned num_pipes = rscreen->tiling_info.num_channels;
 
 	if (rscreen->chip_class <= EVERGREEN &&
-	    rscreen->info.drm_minor < 26)
+	    rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26)
 		return 0;
 
 	/* HW bug on R6xx. */
@@ -501,7 +502,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	/* HTILE is broken with 1D tiling on old kernels and CIK. */
 	if (rscreen->chip_class >= CIK &&
 	    rtex->surface.level[0].mode == RADEON_SURF_MODE_1D &&
-	    rscreen->info.drm_minor < 38)
+	    rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
 		return 0;
 
 	switch (num_pipes) {
@@ -706,6 +707,7 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
 				   const struct pipe_resource *templ)
 {
 	const struct util_format_description *desc = util_format_description(templ->format);
+	bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING;
 
 	/* MSAA resources must be 2D tiled. */
 	if (templ->nr_samples > 1)
@@ -715,10 +717,16 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
 	if (templ->flags & R600_RESOURCE_FLAG_TRANSFER)
 		return RADEON_SURF_MODE_LINEAR_ALIGNED;
 
+	/* r600g: force tiling on TEXTURE_2D and TEXTURE_3D compute resources. */
+	if (rscreen->chip_class >= R600 && rscreen->chip_class <= CAYMAN &&
+	    (templ->bind & PIPE_BIND_COMPUTE_RESOURCE) &&
+	    (templ->target == PIPE_TEXTURE_2D ||
+	     templ->target == PIPE_TEXTURE_3D))
+		force_tiling = true;
+
 	/* Handle common candidates for the linear mode.
 	 * Compressed textures must always be tiled. */
-	if (!(templ->flags & R600_RESOURCE_FLAG_FORCE_TILING) &&
-	    !util_format_is_compressed(templ->format)) {
+	if (!force_tiling && !util_format_is_compressed(templ->format)) {
 		/* Not everything can be linear, so we cannot enforce it
 		 * for all textures. */
 		if ((rscreen->debug_flags & DBG_NO_TILING) &&
@@ -934,7 +942,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
 		use_staging_texture = TRUE;
 	} else if (!(usage & PIPE_TRANSFER_READ) &&
 	    (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) ||
-	     rctx->ws->buffer_is_busy(rtex->resource.buf, RADEON_USAGE_READWRITE))) {
+	     !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) {
 		/* Use a staging texture for uploads if the underlying BO is busy. */
 		use_staging_texture = TRUE;
 	}
@@ -1059,18 +1067,9 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx,
 					struct pipe_transfer* transfer)
 {
 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
-	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
-	struct radeon_winsys_cs_handle *buf;
 	struct pipe_resource *texture = transfer->resource;
 	struct r600_texture *rtex = (struct r600_texture*)texture;
 
-	if (rtransfer->staging) {
-		buf = rtransfer->staging->cs_buf;
-	} else {
-		buf = r600_resource(transfer->resource)->cs_buf;
-	}
-	rctx->ws->buffer_unmap(buf);
-
 	if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtransfer->staging) {
 		if (rtex->is_depth && rtex->resource.b.b.nr_samples <= 1) {
 			ctx->resource_copy_region(ctx, texture, transfer->level,
@@ -1262,7 +1261,9 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 
 		/* fast color clear with 1D tiling doesn't work on old kernels and CIK */
 		if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D &&
-		    rctx->chip_class >= CIK && rctx->screen->info.drm_minor < 38) {
+		    rctx->chip_class >= CIK &&
+		    rctx->screen->info.drm_major == 2 &&
+		    rctx->screen->info.drm_minor < 38) {
 			continue;
 		}
 
@@ -1278,7 +1279,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 				   tex->cmask.offset, tex->cmask.size, 0, true);
 
 		tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
-		fb_state->dirty = true;
+		rctx->set_atom_dirty(rctx, fb_state, true);
 		*buffers &= ~clear_bit;
 	}
 }
diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h
index 74c8d8782a6..115042d153e 100644
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -66,6 +66,9 @@
 #define PKT3_SET_SH_REG                        0x76 /* SI and later */
 #define PKT3_SET_UCONFIG_REG                   0x79 /* CIK and later */
 
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS1      0x1 /* EG and later */
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS2      0x2 /* EG and later */
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS3      0x3 /* EG and later */
 #define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
 #define EVENT_TYPE_ZPASS_DONE                  0x15
@@ -177,7 +180,7 @@
 #define   S_028804_INTERPOLATE_SRC_Z(x)			(((x) & 0x1) << 19)
 #define   S_028804_STATIC_ANCHOR_ASSOCIATIONS(x)	(((x) & 0x1) << 20)
 #define   S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)	(((x) & 0x1) << 21)
-#define   S_028804_OVERRASTERIZATION_AMOUNT(x)		(((x) & 0x7) << 24)
+#define   S_028804_OVERRASTERIZATION_AMOUNT(x)		(((x) & 0x07) << 24)
 #define   S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x)	(((x) & 0x1) << 27)
 #define CM_R_028BDC_PA_SC_LINE_CNTL                  0x28bdc
 #define   S_028BDC_EXPAND_LINE_WIDTH(x)                (((x) & 0x1) << 9)
diff --git a/src/gallium/drivers/radeon/radeon_elf_util.c b/src/gallium/drivers/radeon/radeon_elf_util.c
index 9b508227fd4..2e45d439e7a 100644
--- a/src/gallium/drivers/radeon/radeon_elf_util.c
+++ b/src/gallium/drivers/radeon/radeon_elf_util.c
@@ -103,8 +103,7 @@ static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
 }
 
 void radeon_elf_read(const char *elf_data, unsigned elf_size,
-					struct radeon_shader_binary *binary,
-					unsigned debug)
+		     struct radeon_shader_binary *binary)
 {
 	char *elf_buffer;
 	Elf *elf;
@@ -124,7 +123,6 @@ void radeon_elf_read(const char *elf_data, unsigned elf_size,
 	elf = elf_memory(elf_buffer, elf_size);
 
 	elf_getshdrstrndx(elf, &section_str_index);
-	binary->disassembled = 0;
 
 	while ((section = elf_nextscn(elf, section))) {
 		const char *name;
@@ -145,12 +143,11 @@ void radeon_elf_read(const char *elf_data, unsigned elf_size,
 			binary->config_size = section_data->d_size;
 			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
 			memcpy(binary->config, section_data->d_buf, binary->config_size);
-		} else if (debug && !strcmp(name, ".AMDGPU.disasm")) {
-			binary->disassembled = 1;
+		} else if (!strcmp(name, ".AMDGPU.disasm")) {
+			/* Always read disassembly if it's available. */
 			section_data = elf_getdata(section, section_data);
-			fprintf(stderr, "\nShader Disassembly:\n\n");
-			fprintf(stderr, "%.*s\n", (int)section_data->d_size,
-						  (char *)section_data->d_buf);
+			binary->disasm_string = strndup(section_data->d_buf,
+							section_data->d_size);
 		} else if (!strncmp(name, ".rodata", 7)) {
 			section_data = elf_getdata(section, section_data);
 			binary->rodata_size = section_data->d_size;
diff --git a/src/gallium/drivers/radeon/radeon_elf_util.h b/src/gallium/drivers/radeon/radeon_elf_util.h
index ab83f98ea69..ea4ab2f14b2 100644
--- a/src/gallium/drivers/radeon/radeon_elf_util.h
+++ b/src/gallium/drivers/radeon/radeon_elf_util.h
@@ -37,7 +37,7 @@ struct radeon_shader_reloc;
  * radeon_shader_binary object.
  */
 void radeon_elf_read(const char *elf_data, unsigned elf_size,
-		struct radeon_shader_binary *binary, unsigned debug);
+		     struct radeon_shader_binary *binary);
 
 /**
  * @returns A pointer to the start of the configuration information for
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 6a9557b0b73..e967ad2214e 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -58,7 +58,6 @@ struct radeon_llvm_context {
 	unsigned type;
 	unsigned face_gpr;
 	unsigned two_side;
-	unsigned clip_vertex;
 	unsigned inputs_count;
 	struct r600_shader_io * r600_inputs;
 	struct r600_shader_io * r600_outputs;
@@ -72,21 +71,6 @@ struct radeon_llvm_context {
 
 	/*=== Front end configuration ===*/
 
-	/* Special Intrinsics */
-
-	/** Write to an output register: float store_output(float, i32) */
-	const char * store_output_intr;
-
-	/** Swizzle a vector value: <4 x float> swizzle(<4 x float>, i32)
-	 * The swizzle is an unsigned integer that encodes a TGSI_SWIZZLE_* value
-	 * in 2-bits.
-	 * Swizzle{0-1} = X Channel
-	 * Swizzle{2-3} = Y Channel
-	 * Swizzle{4-5} = Z Channel
-	 * Swizzle{6-7} = W Channel
-	 */
-	const char * swizzle_intr;
-
 	/* Instructions that are not described by any of the TGSI opcodes. */
 
 	/** This function is responsible for initilizing the inputs array and will be
@@ -100,9 +84,6 @@ struct radeon_llvm_context {
 			unsigned index,
 			const struct tgsi_full_declaration *decl);
 
-	/** User data to use with the callbacks */
-	void * userdata;
-
 	/** This array contains the input values for the shader.  Typically these
 	  * values will be in the form of a target intrinsic that will inform the
 	  * backend how to load the actual inputs to the shader. 
@@ -146,6 +127,8 @@ static inline LLVMTypeRef tgsi2llvmtype(
 	case TGSI_TYPE_UNSIGNED:
 	case TGSI_TYPE_SIGNED:
 		return LLVMInt32TypeInContext(ctx);
+	case TGSI_TYPE_DOUBLE:
+		return LLVMDoubleTypeInContext(ctx);
 	case TGSI_TYPE_UNTYPED:
 	case TGSI_TYPE_FLOAT:
 		return LLVMFloatTypeInContext(ctx);
@@ -171,8 +154,9 @@ static inline LLVMValueRef bitcast(
 
 
 void radeon_llvm_emit_prepare_cube_coords(struct lp_build_tgsi_context * bld_base,
-                                          struct lp_build_emit_data * emit_data,
-                                          LLVMValueRef *coords_arg);
+					  struct lp_build_emit_data * emit_data,
+					  LLVMValueRef *coords_arg,
+					  LLVMValueRef *derivs_arg);
 
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx);
 
@@ -191,20 +175,29 @@ unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan);
 
 void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx);
 
-LLVMValueRef
-build_intrinsic(LLVMBuilderRef builder,
-		const char *name,
-		LLVMTypeRef ret_type,
-		LLVMValueRef *args,
-		unsigned num_args,
-		LLVMAttribute attr);
-
 void
 build_tgsi_intrinsic_nomem(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data);
 
+LLVMValueRef
+radeon_llvm_emit_fetch_double(struct lp_build_tgsi_context *bld_base,
+			      LLVMValueRef ptr,
+			      LLVMValueRef ptr2);
 
+LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
+                                  LLVMValueRef value);
+
+LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
+				    const struct tgsi_full_src_register *reg,
+				    enum tgsi_opcode_type type,
+				    unsigned swizzle);
+
+void radeon_llvm_emit_store(
+	struct lp_build_tgsi_context * bld_base,
+	const struct tgsi_full_instruction * inst,
+	const struct tgsi_opcode_info * info,
+	LLVMValueRef dst[4]);
 
 #endif /* RADEON_LLVM_H */
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 25580b6bd4c..00025590137 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -62,6 +62,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type)
 
 	switch (type) {
 	case TGSI_PROCESSOR_VERTEX:
+	case TGSI_PROCESSOR_TESS_CTRL:
+	case TGSI_PROCESSOR_TESS_EVAL:
 		llvm_type = RADEON_LLVM_SHADER_VS;
 		break;
 	case TGSI_PROCESSOR_GEOMETRY:
@@ -142,7 +144,8 @@ static void radeonDiagnosticHandler(LLVMDiagnosticInfoRef di, void *context)
  * @returns 0 for success, 1 for failure
  */
 unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
-			  const char *gpu_family, unsigned dump, LLVMTargetMachineRef tm)
+			     const char *gpu_family, bool dump_ir, bool dump_asm,
+			     LLVMTargetMachineRef tm)
 {
 
 	char cpu[CPU_STRING_LEN];
@@ -165,17 +168,15 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 		}
 		strncpy(cpu, gpu_family, CPU_STRING_LEN);
 		memset(fs, 0, sizeof(fs));
-		if (dump) {
+		if (dump_asm)
 			strncpy(fs, "+DumpCode", FS_STRING_LEN);
-		}
 		tm = LLVMCreateTargetMachine(target, triple, cpu, fs,
 				  LLVMCodeGenLevelDefault, LLVMRelocDefault,
 						  LLVMCodeModelDefault);
 		dispose_tm = true;
 	}
-	if (dump) {
+	if (dump_ir)
 		LLVMDumpModule(M);
-	}
 	/* Setup Diagnostic Handler*/
 	llvm_ctx = LLVMGetModuleContext(M);
 
@@ -204,7 +205,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	buffer_size = LLVMGetBufferSize(out_buffer);
 	buffer_data = LLVMGetBufferStart(out_buffer);
 
-	radeon_elf_read(buffer_data, buffer_size, binary, dump);
+	radeon_elf_read(buffer_data, buffer_size, binary);
 
 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.h b/src/gallium/drivers/radeon/radeon_llvm_emit.h
index 3ccef78e36d..e20aed94c6b 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.h
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.h
@@ -29,6 +29,7 @@
 
 #include <llvm-c/Core.h>
 #include <llvm-c/TargetMachine.h>
+#include <stdbool.h>
 
 struct radeon_shader_binary;
 
@@ -36,11 +37,8 @@ void radeon_llvm_shader_type(LLVMValueRef F, unsigned type);
 
 LLVMTargetRef radeon_llvm_get_r600_target(const char *triple);
 
-unsigned  radeon_llvm_compile(
-	LLVMModuleRef M,
-	struct radeon_shader_binary *binary,
-	const char * gpu_family,
-	unsigned dump,
-	LLVMTargetMachineRef tm);
+unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binary,
+			     const char *gpu_family, bool dump_ir, bool dump_asm,
+			     LLVMTargetMachineRef tm);
 
 #endif /* RADEON_LLVM_EMIT_H */
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index c8c980d9d32..56694700a47 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -109,12 +109,27 @@ emit_array_index(
 	return LLVMBuildAdd(gallivm->builder, addr, lp_build_const_int32(gallivm, offset), "");
 }
 
-static LLVMValueRef
-emit_fetch(
+LLVMValueRef
+radeon_llvm_emit_fetch_double( /* combine two values into one TGSI_TYPE_DOUBLE value */
 	struct lp_build_tgsi_context *bld_base,
-	const struct tgsi_full_src_register *reg,
-	enum tgsi_opcode_type type,
-	unsigned swizzle);
+	LLVMValueRef ptr, /* value placed in element 0 (callers in this file pass values, not pointers) */
+	LLVMValueRef ptr2) /* value placed in element 1 */
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMValueRef result;
+
+	result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2));
+
+	result = LLVMBuildInsertElement(builder, /* element 0 <- ptr, bitcast to TGSI_TYPE_UNSIGNED */
+					result,
+					bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr),
+					bld_base->int_bld.zero, "");
+	result = LLVMBuildInsertElement(builder, /* element 1 <- ptr2, bitcast to TGSI_TYPE_UNSIGNED */
+					result,
+					bitcast(bld_base, TGSI_TYPE_UNSIGNED, ptr2),
+					bld_base->int_bld.one, "");
+	return bitcast(bld_base, TGSI_TYPE_DOUBLE, result); /* reinterpret the i32 vector as TGSI_TYPE_DOUBLE */
+}
 
 static LLVMValueRef
 emit_array_fetch(
@@ -136,7 +151,7 @@ emit_array_fetch(
 
 	for (i = 0; i < size; ++i) {
 		tmp_reg.Register.Index = i + range.First;
-		LLVMValueRef temp = emit_fetch(bld_base, &tmp_reg, type, swizzle);
+		LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, &tmp_reg, type, swizzle);
 		result = LLVMBuildInsertElement(builder, result, temp,
 			lp_build_const_int32(gallivm, i), "");
 	}
@@ -150,23 +165,21 @@ static bool uses_temp_indirect_addressing(
 	return (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY));
 }
 
-static LLVMValueRef
-emit_fetch(
-	struct lp_build_tgsi_context *bld_base,
-	const struct tgsi_full_src_register *reg,
-	enum tgsi_opcode_type type,
-	unsigned swizzle)
+LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
+				    const struct tgsi_full_src_register *reg,
+				    enum tgsi_opcode_type type,
+				    unsigned swizzle)
 {
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
 	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-	LLVMValueRef result = NULL, ptr;
+	LLVMValueRef result = NULL, ptr, ptr2;
 
 	if (swizzle == ~0) {
 		LLVMValueRef values[TGSI_NUM_CHANNELS];
 		unsigned chan;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			values[chan] = emit_fetch(bld_base, reg, type, chan);
+			values[chan] = radeon_llvm_emit_fetch(bld_base, reg, type, chan);
 		}
 		return lp_build_gather_values(bld_base->base.gallivm, values,
 					      TGSI_NUM_CHANNELS);
@@ -184,11 +197,27 @@ emit_fetch(
 	switch(reg->Register.File) {
 	case TGSI_FILE_IMMEDIATE: {
 		LLVMTypeRef ctype = tgsi2llvmtype(bld_base, type);
-		return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
+		if (type == TGSI_TYPE_DOUBLE) {
+			result = LLVMGetUndef(LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), bld_base->base.type.length * 2));
+			result = LLVMConstInsertElement(result,
+							bld->immediates[reg->Register.Index][swizzle],
+							bld_base->int_bld.zero);
+			result = LLVMConstInsertElement(result,
+							bld->immediates[reg->Register.Index][swizzle + 1],
+							bld_base->int_bld.one);
+			return LLVMConstBitCast(result, ctype);
+		} else {
+			return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
+		}
 	}
 
 	case TGSI_FILE_INPUT:
 		result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
+		if (type == TGSI_TYPE_DOUBLE) {
+			ptr = result;
+			ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)];
+			return radeon_llvm_emit_fetch_double(bld_base, ptr, ptr2);
+		}
 		break;
 
 	case TGSI_FILE_TEMPORARY:
@@ -199,11 +228,23 @@ emit_fetch(
 			break;
 		}
 		ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle];
+		if (type == TGSI_TYPE_DOUBLE) {
+			ptr2 = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle + 1];
+			return radeon_llvm_emit_fetch_double(bld_base,
+						 LLVMBuildLoad(builder, ptr, ""),
+						 LLVMBuildLoad(builder, ptr2, ""));
+		}
 		result = LLVMBuildLoad(builder, ptr, "");
 		break;
 
 	case TGSI_FILE_OUTPUT:
 		ptr = lp_get_output_ptr(bld, reg->Register.Index, swizzle);
+		if (type == TGSI_TYPE_DOUBLE) {
+			ptr2 = lp_get_output_ptr(bld, reg->Register.Index, swizzle + 1);
+			return radeon_llvm_emit_fetch_double(bld_base,
+						 LLVMBuildLoad(builder, ptr, ""),
+						 LLVMBuildLoad(builder, ptr2, ""));
+		}
 		result = LLVMBuildLoad(builder, ptr, "");
 		break;
 
@@ -321,8 +362,8 @@ static void emit_declaration(
 	}
 }
 
-static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
-                                         LLVMValueRef value)
+LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
+                                  LLVMValueRef value)
 {
 	struct lp_build_emit_data clamp_emit_data;
 
@@ -336,8 +377,7 @@ static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
 				  &clamp_emit_data);
 }
 
-static void
-emit_store(
+void radeon_llvm_emit_store(
 	struct lp_build_tgsi_context * bld_base,
 	const struct tgsi_full_instruction * inst,
 	const struct tgsi_opcode_info * info,
@@ -348,9 +388,10 @@ emit_store(
 	struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
 	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 	LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-	LLVMValueRef temp_ptr;
+	LLVMValueRef temp_ptr, temp_ptr2 = NULL;
 	unsigned chan, chan_index;
 	boolean is_vec_store = FALSE;
+	enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
 
 	if (dst[0]) {
 		LLVMTypeKind k = LLVMGetTypeKind(LLVMTypeOf(dst[0]));
@@ -371,6 +412,8 @@ emit_store(
 	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
 		LLVMValueRef value = dst[chan_index];
 
+		if (dtype == TGSI_TYPE_DOUBLE && (chan_index == 1 || chan_index == 3))
+			continue;
 		if (inst->Instruction.Saturate)
 			value = radeon_llvm_saturate(bld_base, value);
 
@@ -379,8 +422,9 @@ emit_store(
 			LLVMBuildStore(builder, value, temp_ptr);
 			continue;
 		}
-	
-		value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
+
+		if (dtype != TGSI_TYPE_DOUBLE)
+			value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
 
 		if (reg->Register.Indirect) {
 			struct tgsi_declaration_range range = get_array_range(bld_base,
@@ -418,6 +462,8 @@ emit_store(
 			switch(reg->Register.File) {
 			case TGSI_FILE_OUTPUT:
 				temp_ptr = bld->outputs[reg->Register.Index][chan_index];
+				if (dtype == TGSI_TYPE_DOUBLE)
+					temp_ptr2 = bld->outputs[reg->Register.Index][chan_index + 1];
 				break;
 
 			case TGSI_FILE_TEMPORARY:
@@ -428,12 +474,28 @@ emit_store(
 					break;
 				}
 				temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index];
+				if (dtype == TGSI_TYPE_DOUBLE)
+					temp_ptr2 = ctx->temps[ TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1];
+
 				break;
 
 			default:
 				return;
 			}
-			LLVMBuildStore(builder, value, temp_ptr);
+			if (dtype != TGSI_TYPE_DOUBLE)
+				LLVMBuildStore(builder, value, temp_ptr);
+			else {
+				LLVMValueRef ptr = LLVMBuildBitCast(builder, value,
+								    LLVMVectorType(LLVMIntTypeInContext(bld_base->base.gallivm->context, 32), 2), "");
+				LLVMValueRef val2;
+				value = LLVMBuildExtractElement(builder, ptr,
+								bld_base->uint_bld.zero, "");
+				val2 = LLVMBuildExtractElement(builder, ptr,
+								bld_base->uint_bld.one, "");
+
+				LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, value), temp_ptr);
+				LLVMBuildStore(builder, bitcast(bld_base, TGSI_TYPE_FLOAT, val2), temp_ptr2);
+			}
 		}
 	}
 }
@@ -686,34 +748,26 @@ static void kil_emit(
 	}
 }
 
-void radeon_llvm_emit_prepare_cube_coords(
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data,
-		LLVMValueRef *coords_arg)
+static void radeon_llvm_cube_to_2d_coords(struct lp_build_tgsi_context *bld_base,
+					  LLVMValueRef *in, LLVMValueRef *out)
 {
-
-	unsigned target = emit_data->inst->Texture.Texture;
-	unsigned opcode = emit_data->inst->Instruction.Opcode;
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	LLVMTypeRef type = bld_base->base.elem_type;
 	LLVMValueRef coords[4];
 	LLVMValueRef mad_args[3];
-	LLVMValueRef idx;
-	struct LLVMOpaqueValue *cube_vec;
-	LLVMValueRef v;
+	LLVMValueRef v, cube_vec;
 	unsigned i;
 
-	cube_vec = lp_build_gather_values(bld_base->base.gallivm, coords_arg, 4);
-	v = build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4),
+	cube_vec = lp_build_gather_values(bld_base->base.gallivm, in, 4);
+	v = lp_build_intrinsic(builder, "llvm.AMDGPU.cube", LLVMVectorType(type, 4),
                             &cube_vec, 1, LLVMReadNoneAttribute);
 
-	for (i = 0; i < 4; ++i) {
-		idx = lp_build_const_int32(gallivm, i);
-		coords[i] = LLVMBuildExtractElement(builder, v, idx, "");
-	}
+	for (i = 0; i < 4; ++i)
+		coords[i] = LLVMBuildExtractElement(builder, v,
+						    lp_build_const_int32(gallivm, i), "");
 
-	coords[2] = build_intrinsic(builder, "fabs",
+	coords[2] = lp_build_intrinsic(builder, "llvm.fabs.f32",
 			type, &coords[2], 1, LLVMReadNoneAttribute);
 	coords[2] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_RCP, coords[2]);
 
@@ -729,10 +783,60 @@ void radeon_llvm_emit_prepare_cube_coords(
 			mad_args[0], mad_args[1], mad_args[2]);
 
 	/* apply xyz = yxw swizzle to cooords */
-	coords[2] = coords[3];
-	coords[3] = coords[1];
-	coords[1] = coords[0];
-	coords[0] = coords[3];
+	out[0] = coords[1];
+	out[1] = coords[0];
+	out[2] = coords[3];
+}
+
+void radeon_llvm_emit_prepare_cube_coords(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data,
+		LLVMValueRef *coords_arg,
+		LLVMValueRef *derivs_arg)
+{
+
+	unsigned target = emit_data->inst->Texture.Texture;
+	unsigned opcode = emit_data->inst->Instruction.Opcode;
+	struct gallivm_state * gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef coords[4];
+	unsigned i;
+
+	radeon_llvm_cube_to_2d_coords(bld_base, coords_arg, coords);
+
+	if (opcode == TGSI_OPCODE_TXD && derivs_arg) {
+		LLVMValueRef derivs[4];
+		int axis;
+
+		/* Convert cube derivatives to 2D derivatives. */
+		for (axis = 0; axis < 2; axis++) {
+			LLVMValueRef shifted_cube_coords[4], shifted_coords[4];
+
+			/* Shift the cube coordinates by the derivatives to get
+			 * the cube coordinates of the "neighboring pixel".
+			 */
+			for (i = 0; i < 3; i++)
+				shifted_cube_coords[i] =
+					LLVMBuildFAdd(builder, coords_arg[i],
+						      derivs_arg[axis*3+i], "");
+			shifted_cube_coords[3] = LLVMGetUndef(bld_base->base.elem_type);
+
+			/* Project the shifted cube coordinates onto the face. */
+			radeon_llvm_cube_to_2d_coords(bld_base, shifted_cube_coords,
+						      shifted_coords);
+
+			/* Subtract both sets of 2D coordinates to get 2D derivatives.
+			 * This won't work if the shifted coordinates ended up
+			 * in a different face.
+			 */
+			for (i = 0; i < 2; i++)
+				derivs[axis * 2 + i] =
+					LLVMBuildFSub(builder, shifted_coords[i],
+						      coords[i], "");
+		}
+
+		memcpy(derivs_arg, derivs, sizeof(derivs));
+	}
 
 	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
 	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
@@ -756,140 +860,6 @@ void radeon_llvm_emit_prepare_cube_coords(
 	memcpy(coords_arg, coords, sizeof(coords));
 }
 
-static void txd_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[4];
-	unsigned chan, src;
-	for (src = 0; src < 3; src++) {
-		for (chan = 0; chan < 4; chan++)
-			coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan);
-
-		emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm,
-				coords, 4);
-	}
-	emit_data->arg_count = 3;
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-
-static void txp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	LLVMValueRef src_w;
-	unsigned chan;
-	LLVMValueRef coords[5];
-
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-
-	for (chan = 0; chan < 3; chan++ ) {
-		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		coords[chan] = lp_build_emit_llvm_binary(bld_base,
-					TGSI_OPCODE_DIV, arg, src_w);
-	}
-	coords[3] = bld_base->base.one;
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
-	}
-
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->arg_count = 1;
-}
-
-static void tex_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	/* XXX: lp_build_swizzle_aos() was failing with wrong arg types,
-	 * when we used CHAN_ALL.  We should be able to get this to work,
-	 * but for now we will swizzle it ourselves
-	emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
-						 0, CHAN_ALL);
-
-	*/
-
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[5];
-	unsigned chan;
-	for (chan = 0; chan < 4; chan++) {
-		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
-	}
-
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
-		/* These instructions have additional operand that should be packed
-		 * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords.
-		 * That operand should be passed as a float value in the args array
-		 * right after the coord vector. After packing it's not used anymore,
-		 * that's why arg_count is not increased */
-		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, 0);
-	}
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
-	}
-
-	emit_data->arg_count = 1;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-static void txf_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	const struct tgsi_texture_offset * off = inst->TexOffsets;
-	LLVMTypeRef offset_type = bld_base->int_bld.elem_type;
-
-	/* fetch tex coords */
-	tex_fetch_args(bld_base, emit_data);
-
-	/* fetch tex offsets */
-	if (inst->Texture.NumOffsets) {
-		assert(inst->Texture.NumOffsets == 1);
-
-		emit_data->args[1] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleX],
-			offset_type);
-		emit_data->args[2] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleY],
-			offset_type);
-		emit_data->args[3] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleZ],
-			offset_type);
-	} else {
-		emit_data->args[1] = bld_base->int_bld.zero;
-		emit_data->args[2] = bld_base->int_bld.zero;
-		emit_data->args[3] = bld_base->int_bld.zero;
-	}
-
-	emit_data->arg_count = 4;
-}
-
 static void emit_icmp(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
@@ -996,6 +966,35 @@ static void emit_fcmp(
 	emit_data->output[emit_data->chan] = v;
 }
 
+static void emit_dcmp(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
+	LLVMRealPredicate pred = 0;
+	LLVMValueRef cmp;
+
+	/* Map the double-compare opcode to an LLVM predicate: ordered for
+	 * everything except NE, as is usual for float comparisons.
+	 */
+	switch (emit_data->inst->Instruction.Opcode) {
+	case TGSI_OPCODE_DSEQ: pred = LLVMRealOEQ; break;
+	case TGSI_OPCODE_DSGE: pred = LLVMRealOGE; break;
+	case TGSI_OPCODE_DSLT: pred = LLVMRealOLT; break;
+	case TGSI_OPCODE_DSNE: pred = LLVMRealUNE; break;
+	default: assert(!"unknown instruction"); break;
+	}
+
+	cmp = LLVMBuildFCmp(gallivm->builder, pred,
+			    emit_data->args[0], emit_data->args[1], "");
+
+	/* Sign-extend (or bitcast) the comparison result to a 32-bit integer. */
+	emit_data->output[emit_data->chan] =
+		LLVMBuildSExtOrBitCast(gallivm->builder, cmp, i32, "");
+}
+
 static void emit_not(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
@@ -1161,6 +1160,40 @@ static void emit_ineg(
 			emit_data->args[0], "");
 }
 
+static void emit_dneg(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	/* Negate the first fetched argument with an LLVM fneg. */
+	emit_data->output[emit_data->chan] = LLVMBuildFNeg(
+		bld_base->base.gallivm->builder, emit_data->args[0], "");
+}
+
+static void emit_frac(
+		const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	const char *intr; /* only ever points at string literals */
+	/* frac(x) = x - floor(x); pick the floor intrinsic from the opcode. */
+	if (emit_data->info->opcode == TGSI_OPCODE_FRC)
+		intr = "llvm.floor.f32";
+	else if (emit_data->info->opcode == TGSI_OPCODE_DFRAC)
+		intr = "llvm.floor.f64";
+	else {
+		assert(0);
+		return;
+	}
+
+	LLVMValueRef floor = lp_build_intrinsic(builder, intr, emit_data->dst_type,
+						&emit_data->args[0], 1,
+						LLVMReadNoneAttribute);
+	emit_data->output[emit_data->chan] = LLVMBuildFSub(builder,
+			emit_data->args[0], floor, "");
+}
+
 static void emit_f2i(
 		const struct lp_build_tgsi_action * action,
 		struct lp_build_tgsi_context * bld_base,
@@ -1215,58 +1248,16 @@ static void emit_immediate(struct lp_build_tgsi_context * bld_base,
 	ctx->soa.num_immediates++;
 }
 
-LLVMValueRef
-build_intrinsic(LLVMBuilderRef builder,
-                   const char *name,
-                   LLVMTypeRef ret_type,
-                   LLVMValueRef *args,
-                   unsigned num_args,
-                   LLVMAttribute attr)
-{
-   LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
-   LLVMValueRef function;
-
-   function = LLVMGetNamedFunction(module, name);
-   if(!function) {
-      LLVMTypeRef arg_types[LP_MAX_FUNC_ARGS];
-      unsigned i;
-
-      assert(num_args <= LP_MAX_FUNC_ARGS);
-
-      for(i = 0; i < num_args; ++i) {
-         assert(args[i]);
-         arg_types[i] = LLVMTypeOf(args[i]);
-      }
-
-      function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args);
-
-      if (attr)
-          LLVMAddFunctionAttr(function, attr);
-   }
-
-   return LLVMBuildCall(builder, function, args, num_args, "");
-}
-
-static void build_tgsi_intrinsic(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data,
- LLVMAttribute attr)
-{
-   struct lp_build_context * base = &bld_base->base;
-   emit_data->output[emit_data->chan] = build_intrinsic(
-               base->gallivm->builder, action->intr_name,
-               emit_data->dst_type, emit_data->args,
-               emit_data->arg_count, attr);
-}
-
 void
-build_tgsi_intrinsic_nomem(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data)
+build_tgsi_intrinsic_nomem(const struct lp_build_tgsi_action *action,
+			   struct lp_build_tgsi_context *bld_base,
+			   struct lp_build_emit_data *emit_data)
 {
-	build_tgsi_intrinsic(action, bld_base, emit_data, LLVMReadNoneAttribute);
+	/* Emit the action's intrinsic call, tagged LLVMReadNoneAttribute. */
+	emit_data->output[emit_data->chan] = lp_build_intrinsic(
+		bld_base->base.gallivm->builder, action->intr_name,
+		emit_data->dst_type, emit_data->args, emit_data->arg_count,
+		LLVMReadNoneAttribute);
 }
 
 static void emit_bfi(const struct lp_build_tgsi_action * action,
@@ -1322,7 +1313,7 @@ static void emit_lsb(const struct lp_build_tgsi_action * action,
 	};
 
 	emit_data->output[emit_data->chan] =
-		build_intrinsic(gallivm->builder, "llvm.cttz.i32",
+		lp_build_intrinsic(gallivm->builder, "llvm.cttz.i32",
 				emit_data->dst_type, args, Elements(args),
 				LLVMReadNoneAttribute);
 }
@@ -1341,7 +1332,7 @@ static void emit_umsb(const struct lp_build_tgsi_action * action,
 	};
 
 	LLVMValueRef msb =
-		build_intrinsic(builder, "llvm.ctlz.i32",
+		lp_build_intrinsic(builder, "llvm.ctlz.i32",
 				emit_data->dst_type, args, Elements(args),
 				LLVMReadNoneAttribute);
 
@@ -1368,7 +1359,7 @@ static void emit_imsb(const struct lp_build_tgsi_action * action,
 	LLVMValueRef arg = emit_data->args[0];
 
 	LLVMValueRef msb =
-		build_intrinsic(builder, "llvm.AMDGPU.flbit.i32",
+		lp_build_intrinsic(builder, "llvm.AMDGPU.flbit.i32",
 				emit_data->dst_type, &arg, 1,
 				LLVMReadNoneAttribute);
 
@@ -1407,12 +1398,8 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 						ctx->gallivm.context);
 	ctx->gallivm.builder = LLVMCreateBuilderInContext(ctx->gallivm.context);
 
-	ctx->store_output_intr = "llvm.AMDGPU.store.output.";
-	ctx->swizzle_intr = "llvm.AMDGPU.swizzle";
 	struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base;
 
-	/* XXX: We need to revisit this.I think the correct way to do this is
-	 * to use length = 4 here and use the elem_bld for everything. */
 	type.floating = TRUE;
 	type.fixed = FALSE;
 	type.sign = TRUE;
@@ -1423,28 +1410,32 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	lp_build_context_init(&bld_base->base, &ctx->gallivm, type);
 	lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type));
 	lp_build_context_init(&ctx->soa.bld_base.int_bld, &ctx->gallivm, lp_int_type(type));
+	{
+		struct lp_type dbl_type;
+		dbl_type = type;
+		dbl_type.width *= 2;
+		lp_build_context_init(&ctx->soa.bld_base.dbl_bld, &ctx->gallivm, dbl_type);
+	}
 
 	bld_base->soa = 1;
-	bld_base->emit_store = emit_store;
+	bld_base->emit_store = radeon_llvm_emit_store;
 	bld_base->emit_swizzle = emit_swizzle;
 	bld_base->emit_declaration = emit_declaration;
 	bld_base->emit_immediate = emit_immediate;
 
-	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch;
-	bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = radeon_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = radeon_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = radeon_llvm_emit_fetch;
+	bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = radeon_llvm_emit_fetch;
 	bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = fetch_system_value;
 
 	/* Allocate outputs */
 	ctx->soa.outputs = ctx->outputs;
 
-	/* XXX: Is there a better way to initialize all this ? */
-
 	lp_set_default_actions(bld_base);
 
 	bld_base->op_actions[TGSI_OPCODE_ABS].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "fabs";
+	bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.fabs.f32";
 	bld_base->op_actions[TGSI_OPCODE_AND].emit = emit_and;
 	bld_base->op_actions[TGSI_OPCODE_ARL].emit = emit_arl;
 	bld_base->op_actions[TGSI_OPCODE_BFI].emit = emit_bfi;
@@ -1453,7 +1444,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_BREV].intr_name = "llvm.AMDGPU.brev";
 	bld_base->op_actions[TGSI_OPCODE_BRK].emit = brk_emit;
 	bld_base->op_actions[TGSI_OPCODE_CEIL].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "ceil";
+	bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.ceil.f32";
 	bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp.";
 	bld_base->op_actions[TGSI_OPCODE_CMP].emit = build_tgsi_intrinsic_nomem;
@@ -1461,21 +1452,30 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit;
 	bld_base->op_actions[TGSI_OPCODE_COS].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.cos.f32";
-	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
-	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
-	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_DABS].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DABS].intr_name = "llvm.fabs.f64";
+	bld_base->op_actions[TGSI_OPCODE_DFMA].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DFMA].intr_name = "llvm.fma.f64";
+	bld_base->op_actions[TGSI_OPCODE_DFRAC].emit = emit_frac;
+	bld_base->op_actions[TGSI_OPCODE_DNEG].emit = emit_dneg;
+	bld_base->op_actions[TGSI_OPCODE_DSEQ].emit = emit_dcmp;
+	bld_base->op_actions[TGSI_OPCODE_DSGE].emit = emit_dcmp;
+	bld_base->op_actions[TGSI_OPCODE_DSLT].emit = emit_dcmp;
+	bld_base->op_actions[TGSI_OPCODE_DSNE].emit = emit_dcmp;
+	bld_base->op_actions[TGSI_OPCODE_DRSQ].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DRSQ].intr_name = "llvm.AMDGPU.rsq.f64";
+	bld_base->op_actions[TGSI_OPCODE_DSQRT].emit = build_tgsi_intrinsic_nomem;
+	bld_base->op_actions[TGSI_OPCODE_DSQRT].intr_name = "llvm.sqrt.f64";
 	bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit;
 	bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit;
 	bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
 	bld_base->op_actions[TGSI_OPCODE_EX2].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp.";
 	bld_base->op_actions[TGSI_OPCODE_FLR].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "floor";
+	bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.floor.f32";
 	bld_base->op_actions[TGSI_OPCODE_FMA].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_FMA].intr_name = "llvm.fma.f32";
-	bld_base->op_actions[TGSI_OPCODE_FRC].emit = build_tgsi_intrinsic_nomem;
-	bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction.";
+	bld_base->op_actions[TGSI_OPCODE_FRC].emit = emit_frac;
 	bld_base->op_actions[TGSI_OPCODE_F2I].emit = emit_f2i;
 	bld_base->op_actions[TGSI_OPCODE_F2U].emit = emit_f2u;
 	bld_base->op_actions[TGSI_OPCODE_FSEQ].emit = emit_fcmp;
@@ -1520,6 +1520,9 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.pow.f32";
 	bld_base->op_actions[TGSI_OPCODE_ROUND].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest.";
+	bld_base->op_actions[TGSI_OPCODE_RSQ].intr_name =
+		HAVE_LLVM >= 0x0305 ? "llvm.AMDGPU.rsq.clamped.f32" : "llvm.AMDGPU.rsq";
+	bld_base->op_actions[TGSI_OPCODE_RSQ].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_SGE].emit = emit_cmp;
 	bld_base->op_actions[TGSI_OPCODE_SEQ].emit = emit_cmp;
 	bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl;
@@ -1532,26 +1535,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_SQRT].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_SQRT].intr_name = "llvm.sqrt.f32";
 	bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg;
-	bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
-	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
-	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = build_tgsi_intrinsic_nomem;
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc";
 	bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd;
@@ -1571,13 +1554,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f;
 	bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor;
 	bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp;
-
-	bld_base->rsq_action.emit = build_tgsi_intrinsic_nomem;
-#if HAVE_LLVM >= 0x0305
-	bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq.clamped.f32";
-#else
-	bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq";
-#endif
 }
 
 void radeon_llvm_create_func(struct radeon_llvm_context * ctx,
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index be58d0b9ce3..16ee5410273 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -57,6 +57,7 @@
 
 #define FB_BUFFER_OFFSET 0x1000
 #define FB_BUFFER_SIZE 2048
+#define IT_SCALING_TABLE_SIZE 992
 
 /* UVD decoder representation */
 struct ruvd_decoder {
@@ -65,6 +66,7 @@ struct ruvd_decoder {
 	ruvd_set_dtb			set_dtb;
 
 	unsigned			stream_handle;
+	unsigned			stream_type;
 	unsigned			frame_number;
 
 	struct pipe_screen		*screen;
@@ -73,15 +75,18 @@ struct ruvd_decoder {
 
 	unsigned			cur_buffer;
 
-	struct rvid_buffer		msg_fb_buffers[NUM_BUFFERS];
+	struct rvid_buffer		msg_fb_it_buffers[NUM_BUFFERS];
 	struct ruvd_msg			*msg;
 	uint32_t			*fb;
+	uint8_t				*it;
 
 	struct rvid_buffer		bs_buffers[NUM_BUFFERS];
 	void*				bs_ptr;
 	unsigned			bs_size;
 
 	struct rvid_buffer		dpb;
+	bool				use_legacy;
+	struct rvid_buffer		ctx;
 };
 
 /* flush IB to the hardware */
@@ -107,19 +112,34 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
 
 	reloc_idx = dec->ws->cs_add_reloc(dec->cs, cs_buf, usage, domain,
 					  RADEON_PRIO_MIN);
-	set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off);
-	set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4);
+	if (!dec->use_legacy) {
+		uint64_t addr;
+		addr = dec->ws->buffer_get_virtual_address(cs_buf);
+		addr = addr + off;
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr);
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32);
+	} else {
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off);
+		set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4);
+	}
 	set_reg(dec, RUVD_GPCOM_VCPU_CMD, cmd << 1);
 }
 
-/* map the next available message/feedback buffer */
-static void map_msg_fb_buf(struct ruvd_decoder *dec)
+/* does the codec need an IT buffer? */
+static bool have_it(struct ruvd_decoder *dec)
+{
+	unsigned codec = dec->stream_type; /* true only for H264_PERF and H265 */
+	return codec == RUVD_CODEC_H264_PERF || codec == RUVD_CODEC_H265;
+}
+
+/* map the next available message/feedback/itscaling buffer */
+static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
 {
 	struct rvid_buffer* buf;
 	uint8_t *ptr;
 
 	/* grab the current message/feedback buffer */
-	buf = &dec->msg_fb_buffers[dec->cur_buffer];
+	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* and map it for CPU access */
 	ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE);
@@ -127,6 +147,8 @@ static void map_msg_fb_buf(struct ruvd_decoder *dec)
 	/* calc buffer offsets */
 	dec->msg = (struct ruvd_msg *)ptr;
 	dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
+	if (have_it(dec))
+		dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE);
 }
 
 /* unmap and send a message command to the VCPU */
@@ -139,12 +161,13 @@ static void send_msg_buf(struct ruvd_decoder *dec)
 		return;
 
 	/* grab the current message buffer */
-	buf = &dec->msg_fb_buffers[dec->cur_buffer];
+	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* unmap the buffer */
 	dec->ws->buffer_unmap(buf->res->cs_buf);
 	dec->msg = NULL;
 	dec->fb = NULL;
+	dec->it = NULL;
 
 	/* and send it to the hardware */
 	send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0,
@@ -159,11 +182,12 @@ static void next_buffer(struct ruvd_decoder *dec)
 }
 
 /* convert the profile into something UVD understands */
-static uint32_t profile2stream_type(enum pipe_video_profile profile)
+static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
 {
-	switch (u_reduce_video_profile(profile)) {
+	switch (u_reduce_video_profile(dec->base.profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-		return RUVD_CODEC_H264;
+		return (family >= CHIP_TONGA) ?
+			RUVD_CODEC_H264_PERF : RUVD_CODEC_H264;
 
 	case PIPE_VIDEO_FORMAT_VC1:
 		return RUVD_CODEC_VC1;
@@ -174,23 +198,46 @@ static uint32_t profile2stream_type(enum pipe_video_profile profile)
 	case PIPE_VIDEO_FORMAT_MPEG4:
 		return RUVD_CODEC_MPEG4;
 
+	case PIPE_VIDEO_FORMAT_HEVC:
+		return RUVD_CODEC_H265;
+
 	default:
 		assert(0);
 		return 0;
 	}
 }
 
+static unsigned calc_ctx_size(struct ruvd_decoder *dec)
+{
+	/* Computes a context size from stream dimensions and reference count. */
+	unsigned ctx_size;
+	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
+
+	unsigned max_references = dec->base.max_references + 1;
+	/* minimum reference count: 8 at or above 4096*2000 pixels, else 17 */
+	if (dec->base.width * dec->base.height >= 4096*2000)
+		max_references = MAX2(max_references, 8);
+	else
+		max_references = MAX2(max_references, 17);
+
+	width = align (width, 16);
+	height = align (height, 16);
+	ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024;
+	return ctx_size;
+}
+
 /* calculate size of reference picture buffer */
-static unsigned calc_dpb_size(const struct pipe_video_codec *templ)
+static unsigned calc_dpb_size(struct ruvd_decoder *dec)
 {
 	unsigned width_in_mb, height_in_mb, image_size, dpb_size;
 
 	// always align them to MB size for dpb calculation
-	unsigned width = align(templ->width, VL_MACROBLOCK_WIDTH);
-	unsigned height = align(templ->height, VL_MACROBLOCK_HEIGHT);
+	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
 
 	// always one more for currently decoded picture
-	unsigned max_references = templ->max_references + 1;
+	unsigned max_references = dec->base.max_references + 1;
 
 	// aligned size of a single frame
 	image_size = width * height;
@@ -201,19 +248,67 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ)
 	width_in_mb = width / VL_MACROBLOCK_WIDTH;
 	height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2);
 
-	switch (u_reduce_video_profile(templ->profile)) {
-	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-		// the firmware seems to allways assume a minimum of ref frames
-		max_references = MAX2(NUM_H264_REFS, max_references);
+	switch (u_reduce_video_profile(dec->base.profile)) {
+	case PIPE_VIDEO_FORMAT_MPEG4_AVC: {
+		if (!dec->use_legacy) {
+			unsigned fs_in_mb = width_in_mb * height_in_mb;
+			unsigned alignment = 64, num_dpb_buffer;
 
-		// reference picture buffer
-		dpb_size = image_size * max_references;
+			if (dec->stream_type == RUVD_CODEC_H264_PERF)
+				alignment = 256;
+			switch(dec->base.level) {
+			case 30:
+				num_dpb_buffer = 8100 / fs_in_mb;
+				break;
+			case 31:
+				num_dpb_buffer = 18000 / fs_in_mb;
+				break;
+			case 32:
+				num_dpb_buffer = 20480 / fs_in_mb;
+				break;
+			case 41:
+				num_dpb_buffer = 32768 / fs_in_mb;
+				break;
+			case 42:
+				num_dpb_buffer = 34816 / fs_in_mb;
+				break;
+			case 50:
+				num_dpb_buffer = 110400 / fs_in_mb;
+				break;
+			case 51:
+				num_dpb_buffer = 184320 / fs_in_mb;
+				break;
+			default:
+				num_dpb_buffer = 184320 / fs_in_mb;
+				break;
+			}
+			num_dpb_buffer++;
+			max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references);
+			dpb_size = image_size * max_references;
+			dpb_size += max_references * align(width_in_mb * height_in_mb  * 192, alignment);
+			dpb_size += align(width_in_mb * height_in_mb * 32, alignment);
+		} else {
+			// the firmware seems to always assume a minimum of ref frames
+			max_references = MAX2(NUM_H264_REFS, max_references);
+			// reference picture buffer
+			dpb_size = image_size * max_references;
+			// macroblock context buffer
+			dpb_size += width_in_mb * height_in_mb * max_references * 192;
+			// IT surface buffer
+			dpb_size += width_in_mb * height_in_mb * 32;
+		}
+		break;
+	}
 
-		// macroblock context buffer
-		dpb_size += width_in_mb * height_in_mb * max_references * 192;
+	case PIPE_VIDEO_FORMAT_HEVC:
+		if (dec->base.width * dec->base.height >= 4096*2000)
+			max_references = MAX2(max_references, 8);
+		else
+			max_references = MAX2(max_references, 17);
 
-		// IT surface buffer
-		dpb_size += width_in_mb * height_in_mb * 32;
+		width = align (width, 16);
+		height = align (height, 16);
+		dpb_size = align((width * height * 3) / 2, 256) * max_references;
 		break;
 
 	case PIPE_VIDEO_FORMAT_VC1:
@@ -250,6 +345,8 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ)
 
 		// IT surface buffer
 		dpb_size += align(width_in_mb * height_in_mb * 32, 64);
+
+		dpb_size = MAX2(dpb_size, 30 * 1024 * 1024);
 		break;
 
 	default:
@@ -263,6 +360,12 @@ static unsigned calc_dpb_size(const struct pipe_video_codec *templ)
 	return dpb_size;
 }
 
+/* free associated data in the video buffer callback */
+static void ruvd_destroy_associated_data(void *data)
+{
+	/* Intentionally a no-op: nothing to free, since we only use an intptr */
+}
+
 /* get h264 specific message bits */
 static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_picture_desc *pic)
 {
@@ -286,10 +389,8 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
 		assert(0);
 		break;
 	}
-	if (((dec->base.width * dec->base.height) >> 8) <= 1620)
-		result.level = 30;
-	else
-		result.level = 41;
+
+	result.level = dec->base.level;
 
 	result.sps_info_flags = 0;
 	result.sps_info_flags |= pic->pps->sps->direct_8x8_inference_flag << 0;
@@ -338,6 +439,11 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
 	memcpy(result.scaling_list_4x4, pic->pps->ScalingList4x4, 6*16);
 	memcpy(result.scaling_list_8x8, pic->pps->ScalingList8x8, 2*64);
 
+	if (dec->stream_type == RUVD_CODEC_H264_PERF) {
+		memcpy(dec->it, result.scaling_list_4x4, 6*16);
+		memcpy((dec->it + 96), result.scaling_list_8x8, 2*64);
+	}
+
 	result.num_ref_frames = pic->num_ref_frames;
 
 	result.num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1;
@@ -354,6 +460,151 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
 	return result;
 }
 
+/* get h265 specific message bits; also fills dec->it and tags target with the current POC */
+static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video_buffer *target,
+				     struct pipe_h265_picture_desc *pic)
+{
+	struct ruvd_h265 result;
+	unsigned i;
+
+	memset(&result, 0, sizeof(result));
+
+	result.sps_info_flags = 0;
+	result.sps_info_flags |= pic->pps->sps->scaling_list_enabled_flag << 0;
+	result.sps_info_flags |= pic->pps->sps->amp_enabled_flag << 1;
+	result.sps_info_flags |= pic->pps->sps->sample_adaptive_offset_enabled_flag << 2;
+	result.sps_info_flags |= pic->pps->sps->pcm_enabled_flag << 3;
+	result.sps_info_flags |= pic->pps->sps->pcm_loop_filter_disabled_flag << 4;
+	result.sps_info_flags |= pic->pps->sps->long_term_ref_pics_present_flag << 5;
+	result.sps_info_flags |= pic->pps->sps->sps_temporal_mvp_enabled_flag << 6;
+	result.sps_info_flags |= pic->pps->sps->strong_intra_smoothing_enabled_flag << 7;
+	result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8;
+	if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO)
+		result.sps_info_flags |= 1 << 9;
+
+	result.chroma_format = pic->pps->sps->chroma_format_idc;
+	result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8;
+	result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8;
+	result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4;
+	result.sps_max_dec_pic_buffering_minus1 = pic->pps->sps->sps_max_dec_pic_buffering_minus1;
+	result.log2_min_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_luma_coding_block_size_minus3;
+	result.log2_diff_max_min_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_luma_coding_block_size;
+	result.log2_min_transform_block_size_minus2 = pic->pps->sps->log2_min_transform_block_size_minus2;
+	result.log2_diff_max_min_transform_block_size = pic->pps->sps->log2_diff_max_min_transform_block_size;
+	result.max_transform_hierarchy_depth_inter = pic->pps->sps->max_transform_hierarchy_depth_inter;
+	result.max_transform_hierarchy_depth_intra = pic->pps->sps->max_transform_hierarchy_depth_intra;
+	result.pcm_sample_bit_depth_luma_minus1 = pic->pps->sps->pcm_sample_bit_depth_luma_minus1;
+	result.pcm_sample_bit_depth_chroma_minus1 = pic->pps->sps->pcm_sample_bit_depth_chroma_minus1;
+	result.log2_min_pcm_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_pcm_luma_coding_block_size_minus3;
+	result.log2_diff_max_min_pcm_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size;
+	result.num_short_term_ref_pic_sets = pic->pps->sps->num_short_term_ref_pic_sets;
+
+	result.pps_info_flags = 0;
+	result.pps_info_flags |= pic->pps->dependent_slice_segments_enabled_flag << 0;
+	result.pps_info_flags |= pic->pps->output_flag_present_flag << 1;
+	result.pps_info_flags |= pic->pps->sign_data_hiding_enabled_flag << 2;
+	result.pps_info_flags |= pic->pps->cabac_init_present_flag << 3;
+	result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 4;
+	result.pps_info_flags |= pic->pps->transform_skip_enabled_flag << 5;
+	result.pps_info_flags |= pic->pps->cu_qp_delta_enabled_flag << 6;
+	result.pps_info_flags |= pic->pps->pps_slice_chroma_qp_offsets_present_flag << 7;
+	result.pps_info_flags |= pic->pps->weighted_pred_flag << 8;
+	result.pps_info_flags |= pic->pps->weighted_bipred_flag << 9;
+	result.pps_info_flags |= pic->pps->transquant_bypass_enabled_flag << 10;
+	result.pps_info_flags |= pic->pps->tiles_enabled_flag << 11;
+	result.pps_info_flags |= pic->pps->entropy_coding_sync_enabled_flag << 12;
+	result.pps_info_flags |= pic->pps->uniform_spacing_flag << 13;
+	result.pps_info_flags |= pic->pps->loop_filter_across_tiles_enabled_flag << 14;
+	result.pps_info_flags |= pic->pps->pps_loop_filter_across_slices_enabled_flag << 15;
+	result.pps_info_flags |= pic->pps->deblocking_filter_override_enabled_flag << 16;
+	result.pps_info_flags |= pic->pps->pps_deblocking_filter_disabled_flag << 17;
+	result.pps_info_flags |= pic->pps->lists_modification_present_flag << 18;
+	result.pps_info_flags |= pic->pps->slice_segment_header_extension_present_flag << 19;
+	//result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag; ???
+
+	result.num_extra_slice_header_bits = pic->pps->num_extra_slice_header_bits;
+	result.num_long_term_ref_pic_sps = pic->pps->sps->num_long_term_ref_pics_sps;
+	result.num_ref_idx_l0_default_active_minus1 = pic->pps->num_ref_idx_l0_default_active_minus1;
+	result.num_ref_idx_l1_default_active_minus1 = pic->pps->num_ref_idx_l1_default_active_minus1;
+	result.pps_cb_qp_offset = pic->pps->pps_cb_qp_offset;
+	result.pps_cr_qp_offset = pic->pps->pps_cr_qp_offset;
+	result.pps_beta_offset_div2 = pic->pps->pps_beta_offset_div2;
+	result.pps_tc_offset_div2 = pic->pps->pps_tc_offset_div2;
+	result.diff_cu_qp_delta_depth = pic->pps->diff_cu_qp_delta_depth;
+	result.num_tile_columns_minus1 = pic->pps->num_tile_columns_minus1;
+	result.num_tile_rows_minus1 = pic->pps->num_tile_rows_minus1;
+	result.log2_parallel_merge_level_minus2 = pic->pps->log2_parallel_merge_level_minus2;
+	result.init_qp_minus26 = pic->pps->init_qp_minus26;
+
+	for (i = 0; i < 19; ++i)
+		result.column_width_minus1[i] = pic->pps->column_width_minus1[i];
+
+	for (i = 0; i < 21; ++i)
+		result.row_height_minus1[i] = pic->pps->row_height_minus1[i];
+
+	result.num_delta_pocs_ref_rps_idx = pic->NumDeltaPocsOfRefRpsIdx;
+	result.curr_idx = pic->CurrPicOrderCntVal; /* NOTE(review): POC truncated to uint8_t used as index - confirm */
+	result.curr_poc = pic->CurrPicOrderCntVal;
+
+	vl_video_buffer_set_associated_data(target, &dec->base,
+					    (void *)(uintptr_t)pic->CurrPicOrderCntVal,
+					    &ruvd_destroy_associated_data);
+
+	for (i = 0; i < 16; ++i) {
+		struct pipe_video_buffer *ref = pic->ref[i];
+		uintptr_t ref_pic = 0;
+
+		result.poc_list[i] = pic->PicOrderCntVal[i];
+
+		if (ref)
+			ref_pic = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base);
+		else
+			ref_pic = 0x7F; /* value written when the slot has no reference buffer */
+		result.ref_pic_list[i] = ref_pic;
+	}
+
+	for (i = 0; i < 8; ++i) { /* preset all 8 entries to 0xFF before the used ones are filled */
+		result.ref_pic_set_st_curr_before[i] = 0xFF;
+		result.ref_pic_set_st_curr_after[i] = 0xFF;
+		result.ref_pic_set_lt_curr[i] = 0xFF;
+	}
+
+	for (i = 0; i < MIN2(pic->NumPocStCurrBefore, 8); ++i) /* clamp: the arrays hold 8 entries */
+		result.ref_pic_set_st_curr_before[i] = pic->RefPicSetStCurrBefore[i];
+
+	for (i = 0; i < MIN2(pic->NumPocStCurrAfter, 8); ++i)
+		result.ref_pic_set_st_curr_after[i] = pic->RefPicSetStCurrAfter[i];
+
+	for (i = 0; i < MIN2(pic->NumPocLtCurr, 8); ++i)
+		result.ref_pic_set_lt_curr[i] = pic->RefPicSetLtCurr[i];
+
+	for (i = 0; i < 6; ++i)
+		result.ucScalingListDCCoefSizeID2[i] = pic->pps->sps->ScalingListDCCoeff16x16[i];
+
+	for (i = 0; i < 2; ++i)
+		result.ucScalingListDCCoefSizeID3[i] = pic->pps->sps->ScalingListDCCoeff32x32[i];
+
+	memcpy(dec->it, pic->pps->sps->ScalingList4x4, 6 * 16); /* scaling lists are packed back to back in dec->it */
+	memcpy(dec->it + 96, pic->pps->sps->ScalingList8x8, 6 * 64);
+	memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64);
+	memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64);
+
+	/* TODO
+	result.highestTid;
+	result.isNonRef;
+
+	IDRPicFlag;
+	RAPPicFlag;
+	NumPocTotalCurr;
+	NumShortTermPictureSliceHeaderBits;
+	NumLongTermPictureSliceHeaderBits;
+
+	IsLongTerm[16];
+	*/
+
+	return result;
+}
+
 /* get vc1 specific message bits */
 static struct ruvd_vc1 get_vc1_msg(struct pipe_vc1_picture_desc *pic)
 {
@@ -556,7 +807,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
 
 	assert(decoder);
 
-	map_msg_fb_buf(dec);
+	map_msg_fb_it_buf(dec);
 	memset(dec->msg, 0, sizeof(*dec->msg));
 	dec->msg->size = sizeof(*dec->msg);
 	dec->msg->msg_type = RUVD_MSG_DESTROY;
@@ -568,21 +819,17 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
 	dec->ws->cs_destroy(dec->cs);
 
 	for (i = 0; i < NUM_BUFFERS; ++i) {
-		rvid_destroy_buffer(&dec->msg_fb_buffers[i]);
+		rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]);
 		rvid_destroy_buffer(&dec->bs_buffers[i]);
 	}
 
 	rvid_destroy_buffer(&dec->dpb);
+	if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC)
+		rvid_destroy_buffer(&dec->ctx);
 
 	FREE(dec);
 }
 
-/* free associated data in the video buffer callback */
-static void ruvd_destroy_associated_data(void *data)
-{
-	/* NOOP, since we only use an intptr */
-}
-
 /**
  * start decoding of a new frame
  */
@@ -670,7 +917,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 {
 	struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder;
 	struct radeon_winsys_cs_handle *dt;
-	struct rvid_buffer *msg_fb_buf, *bs_buf;
+	struct rvid_buffer *msg_fb_it_buf, *bs_buf;
 	unsigned bs_size;
 
 	assert(decoder);
@@ -678,26 +925,27 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	if (!dec->bs_ptr)
 		return;
 
-	msg_fb_buf = &dec->msg_fb_buffers[dec->cur_buffer];
+	msg_fb_it_buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 	bs_buf = &dec->bs_buffers[dec->cur_buffer];
 
 	bs_size = align(dec->bs_size, 128);
 	memset(dec->bs_ptr, 0, bs_size - dec->bs_size);
 	dec->ws->buffer_unmap(bs_buf->res->cs_buf);
 
-	map_msg_fb_buf(dec);
+	map_msg_fb_it_buf(dec);
 	dec->msg->size = sizeof(*dec->msg);
 	dec->msg->msg_type = RUVD_MSG_DECODE;
 	dec->msg->stream_handle = dec->stream_handle;
 	dec->msg->status_report_feedback_number = dec->frame_number;
 
-	dec->msg->body.decode.stream_type = profile2stream_type(dec->base.profile);
+	dec->msg->body.decode.stream_type = dec->stream_type;
 	dec->msg->body.decode.decode_flags = 0x1;
 	dec->msg->body.decode.width_in_samples = dec->base.width;
 	dec->msg->body.decode.height_in_samples = dec->base.height;
 
 	dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
 	dec->msg->body.decode.bsd_size = bs_size;
+	dec->msg->body.decode.db_pitch = dec->base.width;
 
 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
 
@@ -706,6 +954,10 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 		dec->msg->body.decode.codec.h264 = get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture);
 		break;
 
+	case PIPE_VIDEO_FORMAT_HEVC:
+		dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture);
+		break;
+
 	case PIPE_VIDEO_FORMAT_VC1:
 		dec->msg->body.decode.codec.vc1 = get_vc1_msg((struct pipe_vc1_picture_desc*)picture);
 		break;
@@ -733,12 +985,19 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 
 	send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0,
 		 RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
+	if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) {
+		send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0,
+			RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
+	}
 	send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf,
 		 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0,
 		 RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
-	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_buf->res->cs_buf,
+	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf,
 		 FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
+	if (have_it(dec))
+		send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf,
+			 FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	set_reg(dec, RUVD_ENGINE_CNTL, 1);
 
 	flush(dec);
@@ -760,7 +1019,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 					     ruvd_set_dtb set_dtb)
 {
 	struct radeon_winsys* ws = ((struct r600_common_context *)context)->ws;
-	unsigned dpb_size = calc_dpb_size(templ);
+	struct r600_common_context *rctx = (struct r600_common_context*)context;
+	unsigned dpb_size;
 	unsigned width = templ->width, height = templ->height;
 	unsigned bs_buf_size;
 	struct radeon_info info;
@@ -791,6 +1051,9 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	if (!dec)
 		return NULL;
 
+	if (info.drm_major < 3)
+		dec->use_legacy = TRUE;
+
 	dec->base = *templ;
 	dec->base.context = context;
 	dec->base.width = width;
@@ -803,11 +1066,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	dec->base.end_frame = ruvd_end_frame;
 	dec->base.flush = ruvd_flush;
 
+	dec->stream_type = profile2stream_type(dec, info.family);
 	dec->set_dtb = set_dtb;
 	dec->stream_handle = rvid_alloc_stream_handle();
 	dec->screen = context->screen;
 	dec->ws = ws;
-	dec->cs = ws->cs_create(ws, RING_UVD, NULL, NULL, NULL);
+	dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL);
 	if (!dec->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
@@ -815,10 +1079,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 
 	bs_buf_size = width * height * 512 / (16 * 16);
 	for (i = 0; i < NUM_BUFFERS; ++i) {
-		unsigned msg_fb_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
+		unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
 		STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET);
-		if (!rvid_create_buffer(dec->screen, &dec->msg_fb_buffers[i],
-					msg_fb_size, PIPE_USAGE_STAGING)) {
+		if (have_it(dec))
+			msg_fb_it_size += IT_SCALING_TABLE_SIZE;
+		if (!rvid_create_buffer(dec->screen, &dec->msg_fb_it_buffers[i],
+					msg_fb_it_size, PIPE_USAGE_STAGING)) {
 			RVID_ERR("Can't allocated message buffers.\n");
 			goto error;
 		}
@@ -829,10 +1095,12 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 			goto error;
 		}
 
-		rvid_clear_buffer(context, &dec->msg_fb_buffers[i]);
+		rvid_clear_buffer(context, &dec->msg_fb_it_buffers[i]);
 		rvid_clear_buffer(context, &dec->bs_buffers[i]);
 	}
 
+	dpb_size = calc_dpb_size(dec);
+
 	if (!rvid_create_buffer(dec->screen, &dec->dpb, dpb_size, PIPE_USAGE_DEFAULT)) {
 		RVID_ERR("Can't allocated dpb.\n");
 		goto error;
@@ -840,14 +1108,23 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 
 	rvid_clear_buffer(context, &dec->dpb);
 
-	map_msg_fb_buf(dec);
+	if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) {
+		unsigned ctx_size = calc_ctx_size(dec);
+		if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) {
+			RVID_ERR("Can't allocated context buffer.\n");
+			goto error;
+		}
+		rvid_clear_buffer(context, &dec->ctx);
+	}
+
+	map_msg_fb_it_buf(dec);
 	dec->msg->size = sizeof(*dec->msg);
 	dec->msg->msg_type = RUVD_MSG_CREATE;
 	dec->msg->stream_handle = dec->stream_handle;
-	dec->msg->body.create.stream_type = profile2stream_type(dec->base.profile);
+	dec->msg->body.create.stream_type = dec->stream_type;
 	dec->msg->body.create.width_in_samples = dec->base.width;
 	dec->msg->body.create.height_in_samples = dec->base.height;
-	dec->msg->body.create.dpb_size = dec->dpb.res->buf->size;
+	dec->msg->body.create.dpb_size = dpb_size;
 	send_msg_buf(dec);
 	flush(dec);
 	next_buffer(dec);
@@ -858,11 +1135,13 @@ error:
 	if (dec->cs) dec->ws->cs_destroy(dec->cs);
 
 	for (i = 0; i < NUM_BUFFERS; ++i) {
-		rvid_destroy_buffer(&dec->msg_fb_buffers[i]);
+		rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]);
 		rvid_destroy_buffer(&dec->bs_buffers[i]);
 	}
 
 	rvid_destroy_buffer(&dec->dpb);
+	if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC)
+		rvid_destroy_buffer(&dec->ctx);
 
 	FREE(dec);
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h
index 7442865c9ec..452fbd60880 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -62,6 +62,8 @@
 #define RUVD_CMD_DECODING_TARGET_BUFFER	0x00000002
 #define RUVD_CMD_FEEDBACK_BUFFER	0x00000003
 #define RUVD_CMD_BITSTREAM_BUFFER	0x00000100
+#define RUVD_CMD_ITSCALING_TABLE_BUFFER	0x00000204
+#define RUVD_CMD_CONTEXT_BUFFER		0x00000206
 
 /* UVD message types */
 #define RUVD_MSG_CREATE		0
@@ -73,6 +75,8 @@
 #define RUVD_CODEC_VC1		0x00000001
 #define RUVD_CODEC_MPEG2	0x00000003
 #define RUVD_CODEC_MPEG4	0x00000004
+#define RUVD_CODEC_H264_PERF	0x00000007
+#define RUVD_CODEC_H265		0x00000010
 
 /* UVD decode target buffer tiling mode */
 #define RUVD_TILE_LINEAR	0x00000000
@@ -171,6 +175,66 @@ struct ruvd_h264 {
 	} mvc;
 };
 
+struct ruvd_h265 { /* h265 member of the decode message codec union; filled by get_h265_msg() */
+	uint32_t	sps_info_flags; /* bits 0..8 from SPS flags, bit 9 set on CHIP_CARRIZO */
+	uint32_t	pps_info_flags; /* bits 0..19 from PPS flags */
+
+	uint8_t		chroma_format;
+	uint8_t		bit_depth_luma_minus8;
+	uint8_t		bit_depth_chroma_minus8;
+	uint8_t		log2_max_pic_order_cnt_lsb_minus4;
+
+	uint8_t		sps_max_dec_pic_buffering_minus1;
+	uint8_t		log2_min_luma_coding_block_size_minus3;
+	uint8_t		log2_diff_max_min_luma_coding_block_size;
+	uint8_t		log2_min_transform_block_size_minus2;
+
+	uint8_t		log2_diff_max_min_transform_block_size;
+	uint8_t		max_transform_hierarchy_depth_inter;
+	uint8_t		max_transform_hierarchy_depth_intra;
+	uint8_t		pcm_sample_bit_depth_luma_minus1;
+
+	uint8_t		pcm_sample_bit_depth_chroma_minus1;
+	uint8_t		log2_min_pcm_luma_coding_block_size_minus3;
+	uint8_t		log2_diff_max_min_pcm_luma_coding_block_size;
+	uint8_t		num_extra_slice_header_bits;
+
+	uint8_t		num_short_term_ref_pic_sets;
+	uint8_t		num_long_term_ref_pic_sps;
+	uint8_t		num_ref_idx_l0_default_active_minus1;
+	uint8_t		num_ref_idx_l1_default_active_minus1;
+
+	int8_t		pps_cb_qp_offset;
+	int8_t		pps_cr_qp_offset;
+	int8_t		pps_beta_offset_div2;
+	int8_t		pps_tc_offset_div2;
+
+	uint8_t		diff_cu_qp_delta_depth;
+	uint8_t		num_tile_columns_minus1;
+	uint8_t		num_tile_rows_minus1;
+	uint8_t		log2_parallel_merge_level_minus2;
+
+	uint16_t	column_width_minus1[19];
+	uint16_t	row_height_minus1[21];
+
+	int8_t		init_qp_minus26;
+	uint8_t		num_delta_pocs_ref_rps_idx;
+	uint8_t		curr_idx; /* get_h265_msg() stores CurrPicOrderCntVal here (truncated to 8 bits) */
+	uint8_t		reserved1;
+	int32_t		curr_poc;
+	uint8_t		ref_pic_list[16]; /* associated data of each reference buffer, 0x7F when there is none */
+	int32_t		poc_list[16];
+	uint8_t		ref_pic_set_st_curr_before[8]; /* entries not filled from the picture desc stay 0xFF */
+	uint8_t		ref_pic_set_st_curr_after[8]; /* same 0xFF preset */
+	uint8_t		ref_pic_set_lt_curr[8]; /* same 0xFF preset */
+
+	uint8_t		ucScalingListDCCoefSizeID2[6];
+	uint8_t		ucScalingListDCCoefSizeID3[2];
+
+	uint8_t		highestTid; /* not set by get_h265_msg() yet (TODO there), stays 0 */
+	uint8_t		isNonRef; /* not set by get_h265_msg() yet (TODO there), stays 0 */
+};
+
 struct ruvd_vc1 {
 	uint32_t	profile;
 	uint32_t	level;
@@ -327,6 +391,7 @@ struct ruvd_msg {
 
 			union {
 				struct ruvd_h264	h264;
+				struct ruvd_h265	h265;
 				struct ruvd_vc1		vc1;
 				struct ruvd_mpeg2	mpeg2;
 				struct ruvd_mpeg4	mpeg4;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index a6567379fe3..7eab974a3df 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -47,6 +47,8 @@
 #define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8))
 #define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8))
 #define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8))
+#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
+#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
 
 /**
  * flush commands to the hardware
@@ -54,6 +56,8 @@
 static void flush(struct rvce_encoder *enc)
 {
 	enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+	enc->task_info_idx = 0; /* task_info() treats 0 as "no previous encode task to patch" */
+	enc->bs_idx = 0; /* restart the per-submission counter that encode() post-increments */
 }
 
 #if 0
@@ -214,7 +218,7 @@ struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc)
  * Calculate the offsets into the CPB
  */
 void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
-		       unsigned *luma_offset, unsigned *chroma_offset)
+		       signed *luma_offset, signed *chroma_offset)
 {
 	unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128);
 	unsigned vpitch = align(enc->luma->npix_y, 16);
@@ -278,24 +282,19 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder,
 		enc->fb = &fb;
 		enc->session(enc);
 		enc->create(enc);
-		enc->rate_control(enc);
-		need_rate_control = false;
-		enc->config_extension(enc);
-		enc->motion_estimation(enc);
-		enc->rdo(enc);
-		if (enc->use_vui)
-			enc->vui(enc);
-		enc->pic_control(enc);
+		enc->config(enc);
 		enc->feedback(enc);
 		flush(enc);
 		//dump_feedback(enc, &fb);
 		rvid_destroy_buffer(&fb);
+		need_rate_control = false;
 	}
 
-	enc->session(enc);
-
-	if (need_rate_control)
-		enc->rate_control(enc);
+	if (need_rate_control) {
+		enc->session(enc);
+		enc->config(enc);
+		flush(enc);
+	}
 }
 
 static void rvce_encode_bitstream(struct pipe_video_codec *encoder,
@@ -312,6 +311,8 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder,
 		RVID_ERR("Can't create feedback buffer.\n");
 		return;
 	}
+	if (!enc->cs->cdw)
+		enc->session(enc);
 	enc->encode(enc);
 	enc->feedback(enc);
 }
@@ -324,7 +325,8 @@ static void rvce_end_frame(struct pipe_video_codec *encoder,
 	struct rvce_cpb_slot *slot = LIST_ENTRY(
 		struct rvce_cpb_slot, enc->cpb_slots.prev, list);
 
-	flush(enc);
+	if (!enc->dual_inst || enc->bs_idx > 1)
+		flush(enc);
 
 	/* update the CPB backtrack with the just encoded frame */
 	slot->picture_type = enc->pic.picture_type;
@@ -363,6 +365,9 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
  */
 static void rvce_flush(struct pipe_video_codec *encoder)
 {
+	struct rvce_encoder *enc = (struct rvce_encoder*)encoder;
+
+	flush(enc); /* rvce_end_frame() may skip its flush when dual_inst is set, so submit here */
 }
 
 static void rvce_cs_flush(void *ctx, unsigned flags,
@@ -377,6 +382,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 					     rvce_get_buffer get_buffer)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen *)context->screen;
+	struct r600_common_context *rctx = (struct r600_common_context*)context;
 	struct rvce_encoder *enc;
 	struct pipe_video_buffer *tmp_buf, templat = {};
 	struct radeon_surf *tmp_surf;
@@ -395,8 +401,17 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	if (!enc)
 		return NULL;
 
+	if (rscreen->info.drm_major == 3)
+		enc->use_vm = true;
 	if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42))
 		enc->use_vui = true;
+	if (rscreen->info.family >= CHIP_TONGA)
+		enc->dual_pipe = true;
+	/* TODO enable B frame with dual instance */
+	if ((rscreen->info.family >= CHIP_TONGA) &&
+		(templ->max_references == 1) &&
+		(rscreen->info.vce_harvest_config == 0))
+		enc->dual_inst = true;
 
 	enc->base = *templ;
 	enc->base.context = context;
@@ -411,7 +426,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 	enc->screen = context->screen;
 	enc->ws = ws;
-	enc->cs = ws->cs_create(ws, RING_VCE, rvce_cs_flush, enc, NULL);
+	enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL);
 	if (!enc->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
@@ -436,6 +451,9 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	cpb_size = cpb_size * align(tmp_surf->npix_y, 16);
 	cpb_size = cpb_size * 3 / 2;
 	cpb_size = cpb_size * enc->cpb_num;
+	if (enc->dual_pipe)
+		cpb_size +=  RVCE_MAX_AUX_BUFFER_NUM *
+			RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2;
 	tmp_buf->destroy(tmp_buf);
 	if (!rvid_create_buffer(enc->screen, &enc->cpb, cpb_size, PIPE_USAGE_DEFAULT)) {
 		RVID_ERR("Can't create CPB buffer.\n");
@@ -455,6 +473,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 	case FW_50_0_1:
 	case FW_50_1_2:
+	case FW_50_10_2:
+	case FW_50_17_3:
 		radeon_vce_50_init(enc);
 		break;
 
@@ -482,5 +502,29 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 {
 	return rscreen->info.vce_fw_version == FW_40_2_2 ||
 		rscreen->info.vce_fw_version == FW_50_0_1 ||
-		rscreen->info.vce_fw_version == FW_50_1_2;
+		rscreen->info.vce_fw_version == FW_50_1_2 ||
+		rscreen->info.vce_fw_version == FW_50_10_2 ||
+		rscreen->info.vce_fw_version == FW_50_17_3;
+}
+
+/**
+ * Add the buffer as relocation to the current command submission and emit the
+ * two dwords addressing it: VA + offset (hi, lo) with use_vm, else reloc index * 4 and offset
+ */
+void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+                     enum radeon_bo_usage usage, enum radeon_bo_domain domain,
+                     signed offset)
+{
+	int reloc_idx;
+
+	reloc_idx = enc->ws->cs_add_reloc(enc->cs, buf, usage, domain, RADEON_PRIO_MIN);
+	if (enc->use_vm) {
+		uint64_t addr;
+		addr = enc->ws->buffer_get_virtual_address(buf);
+		addr = addr + offset; /* offset is signed; callers may pass a negative value */
+		RVCE_CS(addr >> 32); /* high dword first */
+		RVCE_CS(addr); /* low dword */
+	} else {
+		RVCE_CS(reloc_idx * 4); /* same value the old RVCE_READ/WRITE macros emitted */
+		RVCE_CS(offset);
+	}
 }
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 8319ef48cd5..624bda479f8 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -36,15 +36,16 @@
 
 #include "util/list.h"
 
-#define RVCE_RELOC(buf, usage, domain) (enc->ws->cs_add_reloc(enc->cs, (buf), (usage), domain, RADEON_PRIO_MIN))
-
 #define RVCE_CS(value) (enc->cs->buf[enc->cs->cdw++] = (value))
 #define RVCE_BEGIN(cmd) { uint32_t *begin = &enc->cs->buf[enc->cs->cdw++]; RVCE_CS(cmd)
-#define RVCE_READ(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READ, domain) * 4)
-#define RVCE_WRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_WRITE, domain) * 4)
-#define RVCE_READWRITE(buf, domain) RVCE_CS(RVCE_RELOC(buf, RADEON_USAGE_READWRITE, domain) * 4)
+#define RVCE_READ(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READ, (domain), (off))
+#define RVCE_WRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_WRITE, (domain), (off))
+#define RVCE_READWRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READWRITE, (domain), (off))
 #define RVCE_END() *begin = (&enc->cs->buf[enc->cs->cdw] - begin) * 4; }
 
+#define RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE (4096 * 16 * 5 / 2) /* == 4096 * 16 * 2.5, kept integral so size math stays out of double */
+#define RVCE_MAX_AUX_BUFFER_NUM 4
+
 struct r600_common_screen;
 
 /* driver dependent callback */
@@ -76,8 +77,12 @@ struct rvce_encoder {
 	void (*motion_estimation)(struct rvce_encoder *enc);
 	void (*rdo)(struct rvce_encoder *enc);
 	void (*vui)(struct rvce_encoder *enc);
+	void (*config)(struct rvce_encoder *enc);
 	void (*encode)(struct rvce_encoder *enc);
 	void (*destroy)(struct rvce_encoder *enc);
+	void (*task_info)(struct rvce_encoder *enc, uint32_t op,
+			  uint32_t dep, uint32_t fb_idx,
+			  uint32_t ring_idx);
 
 	unsigned			stream_handle;
 
@@ -101,7 +106,14 @@ struct rvce_encoder {
 	struct rvid_buffer		*fb;
 	struct rvid_buffer		cpb;
 	struct pipe_h264_enc_picture_desc pic;
-	bool use_vui;
+
+	unsigned			task_info_idx;
+	unsigned			bs_idx;
+
+	bool				use_vm;
+	bool				use_vui;
+	bool				dual_pipe;
+	bool				dual_inst;
 };
 
 /* CPB handling functions */
@@ -109,7 +121,7 @@ struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc);
 struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc);
 struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc);
 void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
-		       unsigned *luma_offset, unsigned *chroma_offset);
+		       signed *luma_offset, signed *chroma_offset);
 
 struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 					     const struct pipe_video_codec *templat,
@@ -118,6 +130,10 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen);
 
+void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+		     enum radeon_bo_usage usage, enum radeon_bo_domain domain,
+		     signed offset);
+
 /* init vce fw 40.2.2 specific callbacks */
 void radeon_vce_40_2_2_init(struct rvce_encoder *enc);
 
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 51b17b5f6a8..e64fbc7afb0 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -53,30 +53,38 @@ static void session(struct rvce_encoder *enc)
 	RVCE_END();
 }
 
-static void task_info(struct rvce_encoder *enc, uint32_t taskOperation)
+static void task_info(struct rvce_encoder *enc, uint32_t op,
+		      uint32_t dep, uint32_t fb_idx, uint32_t ring_idx)
 {
 	RVCE_BEGIN(0x00000002); // task info
+	if (op == 0x3) { // 0x3 is the operation encode() passes
+		if (enc->task_info_idx) {
+			uint32_t offs = enc->cs->cdw - enc->task_info_idx + 3;
+			// Update offsetOfNextTaskInfo of the previously recorded encode task
+			enc->cs->buf[enc->task_info_idx] = offs;
+		}
+		enc->task_info_idx = enc->cs->cdw; // index of the 0xffffffff placeholder written next
+	}
 	RVCE_CS(0xffffffff); // offsetOfNextTaskInfo
-	RVCE_CS(taskOperation); // taskOperation
-	RVCE_CS(0x00000000); // referencePictureDependency
+	RVCE_CS(op); // taskOperation
+	RVCE_CS(dep); // referencePictureDependency
 	RVCE_CS(0x00000000); // collocateFlagDependency
-	RVCE_CS(0x00000000); // feedbackIndex
-	RVCE_CS(0x00000000); // videoBitstreamRingIndex
+	RVCE_CS(fb_idx); // feedbackIndex
+	RVCE_CS(ring_idx); // videoBitstreamRingIndex
 	RVCE_END();
 }
 
 static void feedback(struct rvce_encoder *enc)
 {
 	RVCE_BEGIN(0x05000005); // feedback buffer
-	RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains); // feedbackRingAddressHi
-	RVCE_CS(0x00000000); // feedbackRingAddressLo
+	RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo, both dwords emitted by rvce_add_buffer()
 	RVCE_CS(0x00000001); // feedbackRingSize
 	RVCE_END();
 }
 
 static void create(struct rvce_encoder *enc)
 {
-	task_info(enc, 0x00000000);
+	enc->task_info(enc, 0x00000000, 0, 0, 0);
 
 	RVCE_BEGIN(0x01000001); // create cmd
 	RVCE_CS(0x00000000); // encUseCircularBuffer
@@ -272,21 +280,31 @@ static void vui(struct rvce_encoder *enc)
 	RVCE_END();
 }
 
+static void config(struct rvce_encoder *enc) // emits the configuration commands that rvce_begin_frame() used to issue one by one
+{
+	enc->task_info(enc, 0x00000002, 0, 0xffffffff, 0); // op 0x2, no dependency, feedback index 0xffffffff, ring index 0
+	enc->rate_control(enc);
+	enc->config_extension(enc);
+	enc->motion_estimation(enc);
+	enc->rdo(enc);
+	if (enc->use_vui) // vui is only emitted when the encoder was created with use_vui set
+		enc->vui(enc);
+	enc->pic_control(enc);
+}
+
 static void encode(struct rvce_encoder *enc)
 {
+	signed luma_offset, chroma_offset;
 	int i;
-	unsigned luma_offset, chroma_offset;
 
-	task_info(enc, 0x00000003);
+	enc->task_info(enc, 0x00000003, 0, 0, 0);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi
-	RVCE_CS(0x00000000); // encodeContextAddressLo
+	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
-	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi
-	RVCE_CS(0x00000000); // videoBitstreamRingAddressLo
+	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, 0x0); // videoBitstreamRingAddressHi/Lo
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 
@@ -298,10 +316,10 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // insertAUD
 	RVCE_CS(0x00000000); // endOfSequence
 	RVCE_CS(0x00000000); // endOfStream
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi
-	RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi
-	RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		  enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		  enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
 	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
 	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
 	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
@@ -404,7 +422,7 @@ static void encode(struct rvce_encoder *enc)
 
 static void destroy(struct rvce_encoder *enc)
 {
-	task_info(enc, 0x00000001);
+	enc->task_info(enc, 0x00000001, 0, 0, 0);
 
 	RVCE_BEGIN(0x02000001); // destroy
 	RVCE_END();
@@ -413,6 +431,7 @@ static void destroy(struct rvce_encoder *enc)
 void radeon_vce_40_2_2_init(struct rvce_encoder *enc)
 {
 	enc->session = session;
+	enc->task_info = task_info;
 	enc->create = create;
 	enc->feedback = feedback;
 	enc->rate_control = rate_control;
@@ -421,6 +440,7 @@ void radeon_vce_40_2_2_init(struct rvce_encoder *enc)
 	enc->motion_estimation = motion_estimation;
 	enc->rdo = rdo;
 	enc->vui = vui;
+	enc->config = config;
 	enc->encode = encode;
 	enc->destroy = destroy;
 }
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index 84a2bfb117e..afdab18c0d3 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -44,18 +44,6 @@
 #include "radeon_video.h"
 #include "radeon_vce.h"
 
-static void task_info(struct rvce_encoder *enc, uint32_t taskOperation)
-{
-	RVCE_BEGIN(0x00000002); // task info
-	RVCE_CS(0xffffffff); // offsetOfNextTaskInfo
-	RVCE_CS(taskOperation); // taskOperation
-	RVCE_CS(0x00000000); // referencePictureDependency
-	RVCE_CS(0x00000000); // collocateFlagDependency
-	RVCE_CS(0x00000000); // feedbackIndex
-	RVCE_CS(0x00000000); // videoBitstreamRingIndex
-	RVCE_END();
-}
-
 static void rate_control(struct rvce_encoder *enc)
 {
 	RVCE_BEGIN(0x04000005); // rate control
@@ -90,22 +78,46 @@ static void rate_control(struct rvce_encoder *enc)
 
 static void encode(struct rvce_encoder *enc)
 {
+	signed luma_offset, chroma_offset, bs_offset;
+	unsigned dep, bs_idx = enc->bs_idx++;
 	int i;
-	unsigned luma_offset, chroma_offset;
 
-	task_info(enc, 0x00000003);
+	if (enc->dual_inst) {
+		if (bs_idx == 0)
+			dep = 1;
+		else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR)
+			dep = 0;
+		else
+			dep = 2;
+	} else
+		dep = 0;
+
+	enc->task_info(enc, 0x00000003, dep, 0, bs_idx);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi
-	RVCE_CS(0x00000000); // encodeContextAddressLo
+	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
+	bs_offset = -(signed)(bs_idx * enc->bs_size);
+
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
-	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi
-	RVCE_CS(0x00000000); // videoBitstreamRingAddressLo
+	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT, bs_offset); // videoBitstreamRingAddressHi/Lo
 	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
 	RVCE_END();
 
+	if (enc->dual_pipe) {
+		unsigned aux_offset = enc->cpb.res->buf->size -
+			RVCE_MAX_AUX_BUFFER_NUM * RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2;
+		RVCE_BEGIN(0x05000002); // auxiliary buffer
+		for (i = 0; i < 8; ++i) {
+			RVCE_CS(aux_offset);
+			aux_offset += RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE;
+		}
+		for (i = 0; i < 8; ++i)
+			RVCE_CS(RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE);
+		RVCE_END();
+	}
+
 	RVCE_BEGIN(0x03000001); // encode
 	RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders
 	RVCE_CS(0x00000000); // pictureStructure
@@ -114,14 +126,17 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // insertAUD
 	RVCE_CS(0x00000000); // endOfSequence
 	RVCE_CS(0x00000000); // endOfStream
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi
-	RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo
-	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi
-	RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
+		enc->chroma->level[0].offset); // inputPictureChromaAddressHi/Lo
 	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
 	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
 	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
-	RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+	if (enc->dual_pipe)
+		RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+	else
+		RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
 	RVCE_CS(0x00000000); // encInputPicTileConfig
 	RVCE_CS(enc->pic.picture_type); // encPicType
 	RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 826e0763c08..3a1834b948f 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -214,9 +214,9 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	        case PIPE_VIDEO_CAP_NPOT_TEXTURES:
         	        return 1;
 	        case PIPE_VIDEO_CAP_MAX_WIDTH:
-        	        return 2048;
+			return (rscreen->family < CHIP_TONGA) ? 2048 : 4096;
 	        case PIPE_VIDEO_CAP_MAX_HEIGHT:
-        	        return 1152;
+			return (rscreen->family < CHIP_TONGA) ? 1152 : 2304;
 	        case PIPE_VIDEO_CAP_PREFERED_FORMAT:
         	        return PIPE_FORMAT_NV12;
 	        case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
@@ -225,6 +225,8 @@ int rvid_get_video_param(struct pipe_screen *screen,
         	        return false;
 	        case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
         	        return true;
+	        case PIPE_VIDEO_CAP_STACKED_FRAMES:
+			return (rscreen->family < CHIP_TONGA) ? 1 : 2;
 	        default:
         	        return 0;
 		}
@@ -262,20 +264,28 @@ int rvid_get_video_param(struct pipe_screen *screen,
 			/* FIXME: VC-1 simple/main profile is broken */
 			return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED &&
 			       entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE;
+		case PIPE_VIDEO_FORMAT_HEVC:
+			/* Carrizo only supports HEVC Main */
+			return rscreen->family >= CHIP_CARRIZO &&
+				   profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
 		default:
 			return false;
 		}
 	case PIPE_VIDEO_CAP_NPOT_TEXTURES:
 		return 1;
 	case PIPE_VIDEO_CAP_MAX_WIDTH:
-		return 2048;
+		return (rscreen->family < CHIP_TONGA) ? 2048 : 4096;
 	case PIPE_VIDEO_CAP_MAX_HEIGHT:
-		return 1152;
+		return (rscreen->family < CHIP_TONGA) ? 1152 : 2304;
 	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 		return PIPE_FORMAT_NV12;
 	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
+			return false; //The hardware doesn't support interlaced HEVC.
 		return true;
 	case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
+			return false; //The hardware doesn't support interlaced HEVC.
 		return true;
 	case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
 		return true;
@@ -300,6 +310,8 @@ int rvid_get_video_param(struct pipe_screen *screen,
 		case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
 		case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
 			return 41;
+		case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+			return 186;
 		default:
 			return 0;
 		}
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 3bfbb6d75b7..7ab6e56e099 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -42,12 +42,9 @@
 
 #include "pipebuffer/pb_buffer.h"
 
-#define RADEON_MAX_CMDBUF_DWORDS (16 * 1024)
-
 #define RADEON_FLUSH_ASYNC		(1 << 0)
 #define RADEON_FLUSH_KEEP_TILING_FLAGS	(1 << 1) /* needs DRM 2.12.0 */
-#define RADEON_FLUSH_COMPUTE		(1 << 2)
-#define RADEON_FLUSH_END_OF_FRAME       (1 << 3)
+#define RADEON_FLUSH_END_OF_FRAME       (1 << 2)
 
 /* Tiling flags. */
 enum radeon_bo_layout {
@@ -136,6 +133,10 @@ enum radeon_family {
     CHIP_KABINI,
     CHIP_HAWAII,
     CHIP_MULLINS,
+    CHIP_TONGA,
+    CHIP_ICELAND,
+    CHIP_CARRIZO,
+    CHIP_FIJI,
     CHIP_LAST,
 };
 
@@ -150,10 +151,12 @@ enum chip_class {
     CAYMAN,
     SI,
     CIK,
+    VI,
 };
 
 enum ring_type {
     RING_GFX = 0,
+    RING_COMPUTE,
     RING_DMA,
     RING_UVD,
     RING_VCE,
@@ -169,9 +172,10 @@ enum radeon_value_id {
     RADEON_NUM_BYTES_MOVED,
     RADEON_VRAM_USAGE,
     RADEON_GTT_USAGE,
-    RADEON_GPU_TEMPERATURE,
+    RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */
     RADEON_CURRENT_SCLK,
-    RADEON_CURRENT_MCLK
+    RADEON_CURRENT_MCLK,
+    RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */
 };
 
 enum radeon_bo_priority {
@@ -192,9 +196,11 @@ enum radeon_bo_priority {
 
 struct winsys_handle;
 struct radeon_winsys_cs_handle;
+struct radeon_winsys_ctx;
 
 struct radeon_winsys_cs {
     unsigned                    cdw;  /* Number of used dwords. */
+    unsigned                    max_dw; /* Maximum number of dwords. */
     uint32_t                    *buf; /* The command buffer. */
     enum ring_type              ring_type;
 };
@@ -238,6 +244,7 @@ struct radeon_info {
 
     boolean                     cik_macrotile_mode_array_valid;
     uint32_t                    cik_macrotile_mode_array[16];
+    uint32_t                    vce_harvest_config;
 };
 
 enum radeon_feature_id {
@@ -317,6 +324,8 @@ struct radeon_surf {
     struct radeon_surf_level    stencil_level[RADEON_SURF_MAX_LEVEL];
     uint32_t                    tiling_index[RADEON_SURF_MAX_LEVEL];
     uint32_t                    stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
+    uint32_t                    pipe_config;
+    uint32_t                    num_banks;
 };
 
 struct radeon_winsys {
@@ -398,24 +407,15 @@ struct radeon_winsys {
     void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf);
 
     /**
-     * Return TRUE if a buffer object is being used by the GPU.
+     * Wait for the buffer and return true if the buffer is not used
+     * by the device.
      *
-     * \param buf       A winsys buffer object.
-     * \param usage     Only check whether the buffer is busy for the given usage.
+     * A timeout of 0 only returns the current status without waiting.
+     * A timeout of PIPE_TIMEOUT_INFINITE always waits until the buffer
+     * is idle.
      */
-    boolean (*buffer_is_busy)(struct pb_buffer *buf,
-                              enum radeon_bo_usage usage);
-
-    /**
-     * Wait for a buffer object until it is not used by a GPU. This is
-     * equivalent to a fence placed after the last command using the buffer,
-     * and synchronizing to the fence.
-     *
-     * \param buf       A winsys buffer object to wait for.
-     * \param usage     Only wait until the buffer is idle for the given usage,
-     *                  but may still be busy for some other usage.
-     */
-    void (*buffer_wait)(struct pb_buffer *buf, enum radeon_bo_usage usage);
+    bool (*buffer_wait)(struct pb_buffer *buf, uint64_t timeout,
+                        enum radeon_bo_usage usage);
 
     /**
      * Return tiling flags describing a memory layout of a buffer object.
@@ -450,10 +450,11 @@ struct radeon_winsys {
                               struct radeon_winsys_cs *rcs,
                               enum radeon_bo_layout microtile,
                               enum radeon_bo_layout macrotile,
+                              unsigned pipe_config,
                               unsigned bankw, unsigned bankh,
                               unsigned tile_split,
                               unsigned stencil_tile_split,
-                              unsigned mtilea,
+                              unsigned mtilea, unsigned num_banks,
                               unsigned stride,
                               bool scanout);
 
@@ -514,16 +515,32 @@ struct radeon_winsys {
      * commands independently of other contexts.
      *************************************************************************/
 
+    /**
+     * Create a command submission context.
+     * Various command streams can be submitted to the same context.
+     */
+    struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws);
+
+    /**
+     * Destroy a context.
+     */
+    void (*ctx_destroy)(struct radeon_winsys_ctx *ctx);
+
+    /**
+     * Query the GPU reset status of a context.
+     */
+    enum pipe_reset_status (*ctx_query_reset_status)(struct radeon_winsys_ctx *ctx);
+
     /**
      * Create a command stream.
      *
-     * \param ws        The winsys this function is called from.
+     * \param ctx       The submission context
      * \param ring_type The ring type (GFX, DMA, UVD)
      * \param flush     Flush callback function associated with the command stream.
      * \param user      User pointer that will be passed to the flush callback.
      * \param trace_buf Trace buffer when tracing is enabled
      */
-    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws,
+    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx,
                                           enum ring_type ring_type,
                                           void (*flush)(void *ctx, unsigned flags,
 							struct pipe_fence_handle **fence),
@@ -668,12 +685,12 @@ struct radeon_winsys {
 };
 
 
-static INLINE void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
+static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
 {
     cs->buf[cs->cdw++] = value;
 }
 
-static INLINE void radeon_emit_array(struct radeon_winsys_cs *cs,
+static inline void radeon_emit_array(struct radeon_winsys_cs *cs,
 				     const uint32_t *values, unsigned count)
 {
     memcpy(cs->buf+cs->cdw, values, count * 4);
diff --git a/src/gallium/drivers/radeonsi/Automake.inc b/src/gallium/drivers/radeonsi/Automake.inc
index 8686fffd71c..5a9dcfd9fd6 100644
--- a/src/gallium/drivers/radeonsi/Automake.inc
+++ b/src/gallium/drivers/radeonsi/Automake.inc
@@ -5,10 +5,12 @@ TARGET_CPPFLAGS += -DGALLIUM_RADEONSI
 TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/drivers/radeonsi/libradeonsi.la \
 	$(RADEON_LIBS) \
-	$(LIBDRM_LIBS)
+	$(LIBDRM_LIBS) \
+	$(AMDGPU_LIBS)
 
 TARGET_RADEON_WINSYS = \
-	$(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la
+	$(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la \
+	$(top_builddir)/src/gallium/winsys/amdgpu/drm/libamdgpuwinsys.la
 
 TARGET_RADEON_COMMON = \
 	$(top_builddir)/src/gallium/drivers/radeon/libradeon.la
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 2876c0ae735..a0b1414f4bb 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -3,6 +3,7 @@ C_SOURCES := \
 	si_blit.c \
 	si_commands.c \
 	si_compute.c \
+	si_cp_dma.c \
 	si_descriptors.c \
 	sid.h \
 	si_dma.c \
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 86111cb86e8..47b586f171e 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -27,7 +27,7 @@
 
 #include "sid.h"
 #include "si_pipe.h"
-#include "../radeon/r600_cs.h"
+#include "radeon/r600_cs.h"
 
 #include "util/u_format.h"
 
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 1f2c4082dbc..48972bd170c 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -57,17 +57,19 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
 	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader);
 	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader);
+	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader);
+	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader);
 	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader);
 	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
 	if (sctx->queued.named.sample_mask) {
 		util_blitter_save_sample_mask(sctx->blitter,
 					      sctx->queued.named.sample_mask->sample_mask);
 	}
-	if (sctx->queued.named.viewport) {
-		util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport->viewport);
+	if (sctx->queued.named.viewport[0]) {
+		util_blitter_save_viewport(sctx->blitter, &sctx->queued.named.viewport[0]->viewport);
 	}
-	if (sctx->queued.named.scissor) {
-		util_blitter_save_scissor(sctx->blitter, &sctx->queued.named.scissor->scissor);
+	if (sctx->queued.named.scissor[0]) {
+		util_blitter_save_scissor(sctx->blitter, &sctx->queued.named.scissor[0]->scissor);
 	}
 	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
 	util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets,
@@ -146,7 +148,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx,
 				struct pipe_surface *zsurf, *cbsurf, surf_tmpl;
 
 				sctx->dbcb_copy_sample = sample;
-				sctx->db_render_state.dirty = true;
+				si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 				surf_tmpl.format = texture->resource.b.b.format;
 				surf_tmpl.u.tex.level = level;
@@ -180,7 +182,7 @@ static void si_blit_decompress_depth(struct pipe_context *ctx,
 
 	sctx->dbcb_depth_copy_enabled = false;
 	sctx->dbcb_stencil_copy_enabled = false;
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 }
 
 static void si_blit_decompress_depth_in_place(struct si_context *sctx,
@@ -192,7 +194,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx,
 	unsigned layer, max_layer, checked_last_layer, level;
 
 	sctx->db_inplace_flush_enabled = true;
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 	surf_tmpl.format = texture->resource.b.b.format;
 
@@ -230,7 +232,7 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx,
 	}
 
 	sctx->db_inplace_flush_enabled = false;
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 }
 
 void si_flush_depth_textures(struct si_context *sctx,
@@ -340,6 +342,8 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 	if (buffers & PIPE_CLEAR_COLOR) {
 		evergreen_do_fast_color_clear(&sctx->b, fb, &sctx->framebuffer.atom,
 					      &buffers, color);
+		if (!buffers)
+			return; /* all buffers have been fast cleared */
 	}
 
 	if (buffers & PIPE_CLEAR_COLOR) {
@@ -374,9 +378,9 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 		}
 
 		zstex->depth_clear_value = depth;
-		sctx->framebuffer.atom.dirty = true; /* updates DB_DEPTH_CLEAR */
+		si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */
 		sctx->db_depth_clear = true;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 
 	si_blitter_begin(ctx, SI_CLEAR);
@@ -389,7 +393,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 		sctx->db_depth_clear = false;
 		sctx->db_depth_disable_expclear = false;
 		zstex->depth_cleared = true;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 }
 
@@ -455,89 +459,6 @@ struct texture_orig_info {
 	unsigned npix0_y;
 };
 
-static void si_compressed_to_blittable(struct pipe_resource *tex,
-				       unsigned level,
-				       struct texture_orig_info *orig)
-{
-	struct r600_texture *rtex = (struct r600_texture*)tex;
-	unsigned pixsize = util_format_get_blocksize(rtex->resource.b.b.format);
-	int new_format;
-	int new_height, new_width;
-
-	orig->format = tex->format;
-	orig->width0 = tex->width0;
-	orig->height0 = tex->height0;
-	orig->npix0_x = rtex->surface.level[0].npix_x;
-	orig->npix0_y = rtex->surface.level[0].npix_y;
-	orig->npix_x = rtex->surface.level[level].npix_x;
-	orig->npix_y = rtex->surface.level[level].npix_y;
-
-	if (pixsize == 8)
-		new_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
-	else
-		new_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
-
-	new_width = util_format_get_nblocksx(tex->format, orig->width0);
-	new_height = util_format_get_nblocksy(tex->format, orig->height0);
-
-	tex->width0 = new_width;
-	tex->height0 = new_height;
-	tex->format = new_format;
-	rtex->surface.level[0].npix_x = util_format_get_nblocksx(orig->format, orig->npix0_x);
-	rtex->surface.level[0].npix_y = util_format_get_nblocksy(orig->format, orig->npix0_y);
-	rtex->surface.level[level].npix_x = util_format_get_nblocksx(orig->format, orig->npix_x);
-	rtex->surface.level[level].npix_y = util_format_get_nblocksy(orig->format, orig->npix_y);
-
-	/* By dividing the dimensions by 4, we effectively decrement
-	 * last_level by 2, therefore the last 2 mipmap levels disappear and
-	 * aren't blittable. Note that the last 3 mipmap levels (4x4, 2x2,
-	 * 1x1) have equal slice sizes, which is an important assumption
-	 * for this to work.
-	 *
-	 * In order to make the last 2 mipmap levels blittable, we have to
-	 * add the slice size of the last mipmap level to the texture
-	 * address, so that even though the hw thinks it reads last_level-2,
-	 * it will actually read last_level-1, and if we add the slice size*2,
-	 * it will read last_level. That's how this workaround works.
-	 */
-	if (level > rtex->resource.b.b.last_level-2)
-		rtex->mipmap_shift = level - (rtex->resource.b.b.last_level-2);
-}
-
-static void si_change_format(struct pipe_resource *tex,
-			     unsigned level,
-			     struct texture_orig_info *orig,
-			     enum pipe_format format)
-{
-	struct r600_texture *rtex = (struct r600_texture*)tex;
-
-	orig->format = tex->format;
-	orig->width0 = tex->width0;
-	orig->height0 = tex->height0;
-	orig->npix0_x = rtex->surface.level[0].npix_x;
-	orig->npix0_y = rtex->surface.level[0].npix_y;
-	orig->npix_x = rtex->surface.level[level].npix_x;
-	orig->npix_y = rtex->surface.level[level].npix_y;
-
-	tex->format = format;
-}
-
-static void si_reset_blittable_to_orig(struct pipe_resource *tex,
-				       unsigned level,
-				       struct texture_orig_info *orig)
-{
-	struct r600_texture *rtex = (struct r600_texture*)tex;
-
-	tex->format = orig->format;
-	tex->width0 = orig->width0;
-	tex->height0 = orig->height0;
-	rtex->surface.level[0].npix_x = orig->npix0_x;
-	rtex->surface.level[0].npix_y = orig->npix0_y;
-	rtex->surface.level[level].npix_x = orig->npix_x;
-	rtex->surface.level[level].npix_y = orig->npix_y;
-	rtex->mipmap_shift = 0;
-}
-
 void si_resource_copy_region(struct pipe_context *ctx,
 			     struct pipe_resource *dst,
 			     unsigned dst_level,
@@ -547,114 +468,116 @@ void si_resource_copy_region(struct pipe_context *ctx,
 			     const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct r600_texture *rdst = (struct r600_texture*)dst;
 	struct pipe_surface *dst_view, dst_templ;
 	struct pipe_sampler_view src_templ, *src_view;
-	struct texture_orig_info orig_info[2];
+	unsigned dst_width, dst_height, src_width0, src_height0;
+	unsigned src_force_level = 0;
 	struct pipe_box sbox, dstbox;
-	boolean restore_orig[2];
 
-	/* Fallback for buffers. */
+	/* Handle buffers first. */
 	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
 		si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width, false);
 		return;
 	}
 
-	memset(orig_info, 0, sizeof(orig_info));
+	assert(u_max_sample(dst) == u_max_sample(src));
 
 	/* The driver doesn't decompress resources automatically while
 	 * u_blitter is rendering. */
 	si_decompress_subresource(ctx, src, src_level,
 				  src_box->z, src_box->z + src_box->depth - 1);
 
-	restore_orig[0] = restore_orig[1] = FALSE;
+	dst_width = u_minify(dst->width0, dst_level);
+	dst_height = u_minify(dst->height0, dst_level);
+	src_width0 = src->width0;
+	src_height0 = src->height0;
+
+	util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
+	util_blitter_default_src_texture(&src_templ, src, src_level);
 
 	if (util_format_is_compressed(src->format) &&
 	    util_format_is_compressed(dst->format)) {
-		si_compressed_to_blittable(src, src_level, &orig_info[0]);
-		restore_orig[0] = TRUE;
-		sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x);
-		sbox.y = util_format_get_nblocksy(orig_info[0].format, src_box->y);
+		unsigned blocksize = util_format_get_blocksize(src->format);
+
+		if (blocksize == 8)
+			src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
+		else
+			src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
+		dst_templ.format = src_templ.format;
+
+		dst_width = util_format_get_nblocksx(dst->format, dst_width);
+		dst_height = util_format_get_nblocksy(dst->format, dst_height);
+		src_width0 = util_format_get_nblocksx(src->format, src_width0);
+		src_height0 = util_format_get_nblocksy(src->format, src_height0);
+
+		dstx = util_format_get_nblocksx(dst->format, dstx);
+		dsty = util_format_get_nblocksy(dst->format, dsty);
+
+		sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+		sbox.y = util_format_get_nblocksy(src->format, src_box->y);
 		sbox.z = src_box->z;
-		sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width);
-		sbox.height = util_format_get_nblocksy(orig_info[0].format, src_box->height);
+		sbox.width = util_format_get_nblocksx(src->format, src_box->width);
+		sbox.height = util_format_get_nblocksy(src->format, src_box->height);
 		sbox.depth = src_box->depth;
 		src_box = &sbox;
 
-		si_compressed_to_blittable(dst, dst_level, &orig_info[1]);
-		restore_orig[1] = TRUE;
-		/* translate the dst box as well */
-		dstx = util_format_get_nblocksx(orig_info[1].format, dstx);
-		dsty = util_format_get_nblocksy(orig_info[1].format, dsty);
-	} else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src)) {
+		src_force_level = src_level;
+	} else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src) ||
+		   /* also *8_SNORM has precision issues, use UNORM instead */
+		   util_format_is_snorm(src->format)) {
 		if (util_format_is_subsampled_422(src->format)) {
-			/* XXX untested */
-			si_change_format(src, src_level, &orig_info[0],
-					 PIPE_FORMAT_R8G8B8A8_UINT);
-			si_change_format(dst, dst_level, &orig_info[1],
-					 PIPE_FORMAT_R8G8B8A8_UINT);
+			src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+			dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
+
+			dst_width = util_format_get_nblocksx(dst->format, dst_width);
+			src_width0 = util_format_get_nblocksx(src->format, src_width0);
+
+			dstx = util_format_get_nblocksx(dst->format, dstx);
 
 			sbox = *src_box;
-			sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x);
-			sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width);
+			sbox.x = util_format_get_nblocksx(src->format, src_box->x);
+			sbox.width = util_format_get_nblocksx(src->format, src_box->width);
 			src_box = &sbox;
-			dstx = util_format_get_nblocksx(orig_info[1].format, dstx);
-
-			restore_orig[0] = TRUE;
-			restore_orig[1] = TRUE;
 		} else {
 			unsigned blocksize = util_format_get_blocksize(src->format);
 
 			switch (blocksize) {
 			case 1:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R8_UNORM);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R8_UNORM);
+				dst_templ.format = PIPE_FORMAT_R8_UNORM;
+				src_templ.format = PIPE_FORMAT_R8_UNORM;
 				break;
 			case 2:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R8G8_UNORM);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R8G8_UNORM);
+				dst_templ.format = PIPE_FORMAT_R8G8_UNORM;
+				src_templ.format = PIPE_FORMAT_R8G8_UNORM;
 				break;
 			case 4:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R8G8B8A8_UNORM);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R8G8B8A8_UNORM);
+				dst_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
+				src_templ.format = PIPE_FORMAT_R8G8B8A8_UNORM;
 				break;
 			case 8:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R16G16B16A16_UINT);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R16G16B16A16_UINT);
+				dst_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
+				src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT;
 				break;
 			case 16:
-				si_change_format(src, src_level, &orig_info[0],
-						PIPE_FORMAT_R32G32B32A32_UINT);
-				si_change_format(dst, dst_level, &orig_info[1],
-						PIPE_FORMAT_R32G32B32A32_UINT);
+				dst_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
+				src_templ.format = PIPE_FORMAT_R32G32B32A32_UINT;
 				break;
 			default:
 				fprintf(stderr, "Unhandled format %s with blocksize %u\n",
 					util_format_short_name(src->format), blocksize);
 				assert(0);
 			}
-			restore_orig[0] = TRUE;
-			restore_orig[1] = TRUE;
 		}
 	}
 
 	/* Initialize the surface. */
-	util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
 	dst_view = r600_create_surface_custom(ctx, dst, &dst_templ,
-					      rdst->surface.level[dst_level].npix_x,
-					      rdst->surface.level[dst_level].npix_y);
+					      dst_width, dst_height);
 
 	/* Initialize the sampler view. */
-	util_blitter_default_src_texture(&src_templ, src, src_level);
-	src_view = ctx->create_sampler_view(ctx, src, &src_templ);
+	src_view = si_create_sampler_view_custom(ctx, src, &src_templ,
+						 src_width0, src_height0,
+						 src_force_level);
 
 	u_box_3d(dstx, dsty, dstz, abs(src_box->width), abs(src_box->height),
 		 abs(src_box->depth), &dstbox);
@@ -662,18 +585,12 @@ void si_resource_copy_region(struct pipe_context *ctx,
 	/* Copy. */
 	si_blitter_begin(ctx, SI_COPY);
 	util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox,
-				  src_view, src_box, src->width0, src->height0,
+				  src_view, src_box, src_width0, src_height0,
 				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
 	si_blitter_end(ctx);
 
 	pipe_surface_reference(&dst_view, NULL);
 	pipe_sampler_view_reference(&src_view, NULL);
-
-	if (restore_orig[0])
-		si_reset_blittable_to_orig(src, src_level, &orig_info[0]);
-
-	if (restore_orig[1])
-		si_reset_blittable_to_orig(dst, dst_level, &orig_info[1]);
 }
 
 /* For MSAA integer resolving to work, we change the format to NORM using this function. */
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 89bef2e7afd..d4fe5653687 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -137,14 +137,14 @@ static void *si_create_compute_state(
 	}
 #else
 
-	radeon_elf_read(code, header->num_bytes, &program->shader.binary, true);
+	radeon_elf_read(code, header->num_bytes, &program->shader.binary);
 
 	/* init_scratch_buffer patches the shader code with the scratch address,
 	 * so we need to call it before si_shader_binary_read() which uploads
 	 * the shader code to the GPU.
 	 */
 	init_scratch_buffer(sctx, program);
-	si_shader_binary_read(sctx->screen, &program->shader, &program->shader.binary);
+	si_shader_binary_read(sctx->screen, &program->shader);
 
 #endif
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
@@ -309,8 +309,6 @@ static void si_launch_grid(
 			kernel_args[i]);
 	}
 
-	sctx->b.ws->buffer_unmap(input_buffer->cs_buf);
-
 	kernel_args_va = input_buffer->gpu_address;
 	kernel_args_va += kernel_args_offset;
 
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
new file mode 100644
index 00000000000..f8a9da45a10
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "si_pipe.h"
+#include "sid.h"
+#include "radeon/r600_cs.h"
+
+
+/* Set this if you want the 3D engine to wait until CP DMA is done.
+ * It should be set on the last CP DMA packet. */
+#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */
+
+/* Set this if the source data was used as a destination in a previous CP DMA
+ * packet. It's for preventing a read-after-write (RAW) hazard between two
+ * CP DMA packets. */
+#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
+#define CIK_CP_DMA_USE_L2	(1 << 2) /* only consumed on the CIK+ DMA_DATA path: sets SRC_SEL/DST_SEL to 3 */
+
+/* Emit a CP DMA packet to do a copy from one buffer to another.
+ * The size must be non-zero and fit in bits [20:0] (both are asserted).
+ * CIK and newer chips get a DMA_DATA packet, older chips a CP_DMA packet. */
+static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
+				       uint64_t dst_va, uint64_t src_va,
+				       unsigned size, unsigned flags)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
+	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
+	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
+			   PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0; /* only emitted in the CIK+ branch */
+
+	assert(size);
+	assert((size & ((1<<21)-1)) == size); /* BYTE_COUNT is a 21-bit field */
+
+	if (sctx->b.chip_class >= CIK) {
+		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
+		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
+		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
+		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
+		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	} else {
+		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
+		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
+		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
+		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	}
+}
+
+/* Emit a CP DMA packet to clear a buffer. The size must be non-zero and fit in bits [20:0]. */
+static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
+					uint64_t dst_va, unsigned size,
+					uint32_t clear_value, unsigned flags)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
+	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
+	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0; /* only emitted in the CIK+ branch */
+
+	assert(size);
+	assert((size & ((1<<21)-1)) == size); /* BYTE_COUNT is a 21-bit field */
+
+	if (sctx->b.chip_class >= CIK) {
+		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
+		radeon_emit(cs, clear_value);		/* DATA [31:0] */
+		radeon_emit(cs, 0);			/* dword in the SRC_ADDR_HI slot of DMA_DATA; written as 0 */
+		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI (not masked to 16 bits, unlike the pre-CIK packet) */
+		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	} else {
+		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
+		radeon_emit(cs, clear_value);		/* DATA [31:0] */
+		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
+		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
+		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
+		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+	}
+}
+
+/* The max number of bytes to copy per packet. */
+#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+
+/* Clear "size" bytes of "dst" starting at byte "offset" with the 32-bit
+ * pattern "value". Dword-aligned ranges are cleared with CP DMA packets;
+ * unaligned ranges are written by the CPU through a buffer mapping.
+ * "is_framebuffer" selects which caches are flushed around the operation. */
+static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
+			    unsigned offset, unsigned size, unsigned value,
+			    bool is_framebuffer)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	unsigned flush_flags, tc_l2_flag;
+
+	if (!size)
+		return;
+
+	/* Mark the buffer range of destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&r600_resource(dst)->valid_buffer_range, offset, offset + size);
+
+	/* Fallback for unaligned clears: write exactly [offset, offset + size)
+	 * byte by byte, taking each byte of "value" by its position in the dword. */
+	if (offset % 4 != 0 || size % 4 != 0) {
+		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+						      sctx->b.rings.gfx.cs,
+						      PIPE_TRANSFER_WRITE);
+		map += offset;
+		for (unsigned i = 0; i < size; i++)
+			map[i] = (value >> (((offset + i) % 4) * 8)) & 0xff;
+		return;
+	}
+
+	uint64_t va = r600_resource(dst)->gpu_address + offset;
+
+	/* Flush the caches where the resource is bound. */
+	if (is_framebuffer) {
+		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+		tc_l2_flag = 0;
+	} else {
+		flush_flags = SI_CONTEXT_INV_TC_L1 |
+			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
+			      SI_CONTEXT_INV_KCACHE;
+		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	}
+
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+
+	while (size) {
+		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
+		unsigned dma_flags = tc_l2_flag;
+
+		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
+
+		/* This must be done after si_need_cs_space. */
+		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
+				      RADEON_PRIO_MIN);
+
+		/* Flush the caches for the first copy only. Also wait for the previous CP DMA operations. */
+		if (sctx->b.flags) {
+			si_emit_cache_flush(&sctx->b, NULL);
+			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
+		}
+
+		/* Do the synchronization after the last copy, so that all data is written to memory. */
+		if (size == byte_count)
+			dma_flags |= R600_CP_DMA_SYNC;
+
+		/* Emit the clear packet. */
+		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
+
+		size -= byte_count;
+		va += byte_count;
+	}
+
+	/* Flush the caches again in case the 3D engine has been prefetching the resource. */
+	sctx->b.flags |= flush_flags;
+
+	if (tc_l2_flag)
+		r600_resource(dst)->TC_L2_dirty = true;
+}
+
+/* Copy "size" bytes from "src" at "src_offset" to "dst" at "dst_offset" using
+ * CP DMA packets of at most CP_DMA_MAX_BYTE_COUNT bytes each. */
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
+		    bool is_framebuffer)
+{
+	unsigned flush_flags, tc_l2_flag;
+
+	if (!size)
+		return;
+
+	/* Mark the buffer range of destination as valid (initialized), so that
+	 * transfer_map knows it should wait for the GPU when mapping that range. */
+	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
+		       dst_offset + size);
+
+	dst_offset += r600_resource(dst)->gpu_address;
+	src_offset += r600_resource(src)->gpu_address;
+
+	/* Flush the caches where the resource is bound. */
+	if (is_framebuffer) {
+		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+		tc_l2_flag = 0;
+	} else {
+		flush_flags = SI_CONTEXT_INV_TC_L1 |
+			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
+			      SI_CONTEXT_INV_KCACHE;
+		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	}
+
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+			 flush_flags;
+
+	while (size) {
+		unsigned sync_flags = tc_l2_flag;
+		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
+
+		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
+
+		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
+		if (sctx->b.flags) {
+			si_emit_cache_flush(&sctx->b, NULL);
+			sync_flags |= SI_CP_DMA_RAW_WAIT;
+		}
+
+		/* Do the synchronization after the last copy, so that all data is written to memory. */
+		if (size == byte_count) {
+			sync_flags |= R600_CP_DMA_SYNC;
+		}
+
+		/* This must be done after si_need_cs_space. */
+		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
+				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
+		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
+				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
+
+		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+
+		size -= byte_count;
+		src_offset += byte_count;
+		dst_offset += byte_count;
+	}
+
+	/* Flush the caches again in case the 3D engine has been prefetching the resource. */
+	sctx->b.flags |= flush_flags;
+
+	if (tc_l2_flag)
+		r600_resource(dst)->TC_L2_dirty = true;
+}
+
+void si_init_cp_dma_functions(struct si_context *sctx)
+{
+	sctx->b.clear_buffer = si_clear_buffer; /* install the clear_buffer callback of this context */
+}
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index bbfd36dcbeb..890be071596 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -24,14 +24,23 @@
  *      Marek Olšák <marek.olsak@amd.com>
  */
 
-/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
- * live in memory on SI.
+/* Resource binding slots and sampler states (each described with 8 or
+ * 4 dwords) are stored in lists in memory which is accessed by shaders
+ * using scalar load instructions.
  *
- * This file is responsible for managing lists of resources and sampler states
- * in memory and binding them, which means updating those structures in memory.
+ * This file is responsible for managing such lists. It keeps a copy of all
+ * descriptors in CPU memory and re-uploads a whole list if some slots have
+ * been changed.
  *
- * There is also code for updating shader pointers to resources and sampler
- * states. CP DMA functions are here too.
+ * This code is also responsible for updating shader pointers to those lists.
+ *
+ * Note that CP DMA can't be used for updating the lists, because a GPU hang
+ * could leave the list in a mid-IB state and the next IB would get wrong
+ * descriptors and the whole context would be unusable at that point.
+ * (Note: Register shadowing can't be used for the same reason.)
+ *
+ * Also, uploading descriptors to newly allocated memory doesn't require
+ * a KCACHE flush.
  */
 
 #include "radeon/r600_cs.h"
@@ -42,7 +51,6 @@
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 
-#define SI_NUM_CONTEXTS 16
 
 /* NULL image and buffer descriptor.
  *
@@ -64,284 +72,62 @@ static uint32_t null_descriptor[8] = {
 	 * descriptor */
 };
 
-/* Set this if you want the 3D engine to wait until CP DMA is done.
- * It should be set on the last CP DMA packet. */
-#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */
-
-/* Set this if the source data was used as a destination in a previous CP DMA
- * packet. It's for preventing a read-after-write (RAW) hazard between two
- * CP DMA packets. */
-#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
-#define CIK_CP_DMA_USE_L2	(1 << 2)
-
-/* Emit a CP DMA packet to do a copy from one buffer to another.
- * The size must fit in bits [20:0].
- */
-static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
-				       uint64_t dst_va, uint64_t src_va,
-				       unsigned size, unsigned flags)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
-	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
-	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
-			   PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;
-
-	assert(size);
-	assert((size & ((1<<21)-1)) == size);
-
-	if (sctx->b.chip_class >= CIK) {
-		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
-		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
-		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
-		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	} else {
-		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
-		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
-		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
-		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	}
-}
-
-/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
-static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
-					uint64_t dst_va, unsigned size,
-					uint32_t clear_value, unsigned flags)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
-	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
-	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;
-
-	assert(size);
-	assert((size & ((1<<21)-1)) == size);
-
-	if (sctx->b.chip_class >= CIK) {
-		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
-		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
-		radeon_emit(cs, clear_value);		/* DATA [31:0] */
-		radeon_emit(cs, 0);
-		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	} else {
-		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
-		radeon_emit(cs, clear_value);		/* DATA [31:0] */
-		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
-		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
-		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
-	}
-}
-
-static void si_init_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
-				unsigned shader_userdata_reg,
+static void si_init_descriptors(struct si_descriptors *desc,
+				unsigned shader_userdata_index,
 				unsigned element_dw_size,
-				unsigned num_elements,
-				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
+				unsigned num_elements)
 {
-	assert(num_elements <= sizeof(desc->enabled_mask)*8);
-	assert(num_elements <= sizeof(desc->dirty_mask)*8);
+	int i;
 
-	desc->atom.emit = (void*)emit_func;
-	desc->shader_userdata_reg = shader_userdata_reg;
+	assert(num_elements <= sizeof(desc->enabled_mask)*8);
+
+	desc->list = CALLOC(num_elements, element_dw_size * 4);
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
-	desc->context_size = num_elements * element_dw_size * 4;
+	desc->list_dirty = true; /* upload the list before the next draw */
+	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	desc->buffer = (struct r600_resource*)
-		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-				   PIPE_USAGE_DEFAULT,
-				   SI_NUM_CONTEXTS * desc->context_size);
-
-	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
-			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-
-	/* We don't check for CS space here, because this should be called
-	 * only once at context initialization. */
-	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
-				    desc->buffer->b.b.width0, 0,
-				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
+	/* Initialize the array to NULL descriptors if the element size is 8. */
+	if (element_dw_size == 8)
+		for (i = 0; i < num_elements; i++)
+			memcpy(desc->list + i*element_dw_size, null_descriptor,
+			       sizeof(null_descriptor));
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
 	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
+	FREE(desc->list);
 }
 
-static void si_update_descriptors(struct si_context *sctx,
+static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc)
 {
-	if (desc->dirty_mask) {
-		desc->atom.num_dw =
-			7 + /* copy */
-			(4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */
-			4; /* pointer update */
+	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+	void *ptr;
 
-		if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
-		    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
-			desc->atom.num_dw += 4; /* second pointer update */
+	if (!desc->list_dirty)
+		return true;
 
-		desc->atom.dirty = true;
+	u_upload_alloc(sctx->b.uploader, 0, list_size,
+		       &desc->buffer_offset,
+		       (struct pipe_resource**)&desc->buffer, &ptr);
+	if (!desc->buffer)
+		return false; /* skip the draw call */
 
-		/* TODO: Investigate if these flushes can be removed after
-		 * adding CE support. */
+	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-		/* The descriptors are read with the K cache. */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
-		/* Since SI uses uncached CP DMA to update descriptors,
-		 * we have to flush TC L2, which is used to fetch constants
-		 * along with KCACHE. */
-		if (sctx->b.chip_class == SI)
-			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
-	} else {
-		desc->atom.dirty = false;
-	}
-}
-
-static void si_emit_shader_pointer(struct si_context *sctx,
-				   struct r600_atom *atom)
-{
-	struct si_descriptors *desc = (struct si_descriptors*)atom;
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint64_t va = desc->buffer->gpu_address +
-		      desc->current_context_id * desc->context_size +
-		      desc->buffer_offset;
-
-	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
-	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
-	radeon_emit(cs, va);
-	radeon_emit(cs, va >> 32);
-
-	if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
-	    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
-		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
-		radeon_emit(cs, (desc->shader_userdata_reg +
-				 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
-				  R_00B130_SPI_SHADER_USER_DATA_VS_0) -
-				 SI_SH_REG_OFFSET) >> 2);
-		radeon_emit(cs, va);
-		radeon_emit(cs, va >> 32);
-	}
-}
-
-static void si_emit_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
-				uint32_t **descriptors)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint64_t va_base;
-	int packet_start = 0;
-	int packet_size = 0;
-	int last_index = desc->num_elements; /* point to a non-existing element */
-	uint64_t dirty_mask = desc->dirty_mask;
-	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
-
-	assert(dirty_mask);
-
-	va_base = desc->buffer->gpu_address;
-
-	/* Copy the descriptors to a new context slot. */
-	si_emit_cp_dma_copy_buffer(sctx,
-				   va_base + new_context_id * desc->context_size,
-				   va_base + desc->current_context_id * desc->context_size,
-				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
-
-	va_base += new_context_id * desc->context_size;
-
-	/* Update the descriptors.
-	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
-	 *
-	 * XXX When unbinding lots of resources, consider clearing the memory
-	 *     with CP DMA instead of emitting zeros.
-	 */
-	while (dirty_mask) {
-		int i = u_bit_scan64(&dirty_mask);
-
-		assert(i < desc->num_elements);
-
-		if (last_index+1 == i && packet_size) {
-			/* Append new data at the end of the last packet. */
-			packet_size += desc->element_dw_size;
-			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
-		} else {
-			/* Start a new packet. */
-			uint64_t va = va_base + i * desc->element_dw_size * 4;
-
-			packet_start = cs->cdw;
-			packet_size = 2 + desc->element_dw_size;
-
-			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
-			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
-						PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
-						PKT3_WRITE_DATA_DST_SEL_TC_L2) |
-					     PKT3_WRITE_DATA_WR_CONFIRM |
-					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
-			radeon_emit(cs, va & 0xFFFFFFFFUL);
-			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
-		}
-
-		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
-
-		last_index = i;
-	}
-
-	desc->dirty_mask = 0;
-	desc->current_context_id = new_context_id;
-
-	/* Now update the shader userdata pointer. */
-	si_emit_shader_pointer(sctx, &desc->atom);
-}
-
-static unsigned si_get_shader_user_data_base(unsigned shader)
-{
-	switch (shader) {
-	case PIPE_SHADER_VERTEX:
-		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
-	case PIPE_SHADER_GEOMETRY:
-		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
-	case PIPE_SHADER_FRAGMENT:
-		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
-	default:
-		assert(0);
-		return 0;
-	}
+	desc->list_dirty = false;
+	desc->pointer_dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
+	return true;
 }
 
 /* SAMPLER VIEWS */
 
-static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_views *views = (struct si_sampler_views*)atom;
-
-	si_emit_descriptors(sctx, &views->desc, views->desc_data);
-}
-
-static void si_init_sampler_views(struct si_context *sctx,
-				  struct si_sampler_views *views,
-				  unsigned shader)
-{
-	int i;
-
-	si_init_descriptors(sctx, &views->desc,
-			    si_get_shader_user_data_base(shader) +
-			    SI_SGPR_RESOURCE * 4,
-			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
-
-	for (i = 0; i < views->desc.num_elements; i++) {
-		views->desc_data[i] = null_descriptor;
-		views->desc.dirty_mask |= 1llu << i;
-	}
-	si_update_descriptors(sctx, &views->desc);
-}
-
 static void si_release_sampler_views(struct si_sampler_views *views)
 {
 	int i;
@@ -382,10 +168,10 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 				      si_get_resource_ro_priority(rview->resource));
 	}
 
+	if (!views->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &views->desc.atom);
 }
 
 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
@@ -406,17 +192,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 				rview->resource, RADEON_USAGE_READ,
 				si_get_resource_ro_priority(rview->resource));
 
-
 		pipe_sampler_view_reference(&views->views[slot], view);
-		views->desc_data[slot] = view_desc;
+		memcpy(views->desc.list + slot*8, view_desc, 8*4);
 		views->desc.enabled_mask |= 1llu << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		views->desc_data[slot] = null_descriptor;
+		memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
 		views->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	views->desc.dirty_mask |= 1llu << slot;
+	views->desc.list_dirty = true;
 }
 
 static void si_set_sampler_views(struct pipe_context *ctx,
@@ -475,25 +260,17 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 					    NULL, NULL);
 		}
 	}
-
-	si_update_descriptors(sctx, &samplers->views.desc);
 }
 
 /* SAMPLER STATES */
 
-static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_states *states = (struct si_sampler_states*)atom;
-
-	si_emit_descriptors(sctx, &states->desc, states->desc_data);
-}
-
 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 					   struct si_sampler_states *states)
 {
+	if (!states->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-	si_emit_shader_pointer(sctx, &states->desc.atom);
 }
 
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
@@ -513,66 +290,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 	for (i = 0; i < count; i++) {
 		unsigned slot = start + i;
 
-		if (!sstates[i]) {
-			samplers->desc.dirty_mask &= ~(1llu << slot);
+		if (!sstates[i])
 			continue;
-		}
 
-		samplers->desc_data[slot] = sstates[i]->val;
-		samplers->desc.dirty_mask |= 1llu << slot;
+		memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
+		samplers->desc.list_dirty = true;
 	}
-
-	si_update_descriptors(sctx, &samplers->desc);
 }
 
 /* BUFFER RESOURCES */
 
-static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;
-
-	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
-}
-
-static void si_init_buffer_resources(struct si_context *sctx,
-				     struct si_buffer_resources *buffers,
-				     unsigned num_buffers, unsigned shader,
+static void si_init_buffer_resources(struct si_buffer_resources *buffers,
+				     unsigned num_buffers,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_priority priority)
 {
-	int i;
-
-	buffers->num_buffers = num_buffers;
 	buffers->shader_usage = shader_usage;
 	buffers->priority = priority;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
-	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);
 
-	/* si_emit_descriptors only accepts an array of arrays.
-	 * This adds such an array. */
-	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
-	for (i = 0; i < num_buffers; i++) {
-		buffers->desc_data[i] = &buffers->desc_storage[i*4];
-	}
-
-	si_init_descriptors(sctx, &buffers->desc,
-			    si_get_shader_user_data_base(shader) +
-			    shader_userdata_index*4, 4, num_buffers,
-			    si_emit_buffer_resources);
+	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
+			    num_buffers);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
 {
 	int i;
 
-	for (i = 0; i < buffers->num_buffers; i++) {
+	for (i = 0; i < buffers->desc.num_elements; i++) {
 		pipe_resource_reference(&buffers->buffers[i], NULL);
 	}
 
 	FREE(buffers->buffers);
-	FREE(buffers->desc_storage);
-	FREE(buffers->desc_data);
 	si_release_descriptors(&buffers->desc);
 }
 
@@ -590,11 +340,11 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 				      buffers->shader_usage, buffers->priority);
 	}
 
+	if (!buffers->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &buffers->desc.atom);
 }
 
 /* VERTEX BUFFERS */
@@ -617,14 +367,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
 	}
+
+	if (!desc->buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &desc->atom);
 }
 
-void si_update_vertex_buffers(struct si_context *sctx)
+static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
 	struct si_descriptors *desc = &sctx->vertex_buffers;
 	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
@@ -632,8 +383,10 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	uint64_t va;
 	uint32_t *ptr;
 
+	if (!sctx->vertex_buffers_dirty)
+		return true;
 	if (!count || !sctx->vertex_elements)
-		return;
+		return true;
 
 	/* Vertex buffer descriptors are the only ones which are uploaded
 	 * directly through a staging buffer and don't go through
@@ -641,13 +394,14 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	 */
 	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+	if (!desc->buffer)
+		return false;
 
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
 
 	assert(count <= SI_NUM_VERTEX_BUFFERS);
-	assert(desc->current_context_id == 0);
 
 	for (i = 0; i < count; i++) {
 		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
@@ -675,7 +429,8 @@ void si_update_vertex_buffers(struct si_context *sctx)
 		desc[0] = va & 0xFFFFFFFF;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(vb->stride);
-		if (vb->stride)
+
+		if (sctx->b.chip_class <= CIK && vb->stride)
 			/* Round up by rounding down and adding 1 */
 			desc[2] = (vb->buffer->width0 - offset -
 				   sctx->vertex_elements->format_size[i]) /
@@ -693,13 +448,14 @@ void si_update_vertex_buffers(struct si_context *sctx)
 		}
 	}
 
-	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
-	desc->atom.dirty = true;
-
 	/* Don't flush the const cache. It would have a very negative effect
 	 * on performance (confirmed by testing). New descriptors are always
 	 * uploaded to a fresh new buffer, so I don't think flushing the const
 	 * cache is needed. */
+	desc->pointer_dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
+	sctx->vertex_buffers_dirty = false;
+	return true;
 }
 
 
@@ -724,7 +480,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 	if (shader >= SI_NUM_SHADERS)
 		return;
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
@@ -751,7 +507,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		}
 
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(0);
@@ -770,12 +526,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* RING BUFFERS */
@@ -784,7 +539,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
-			unsigned element_size, unsigned index_stride)
+			unsigned element_size, unsigned index_stride, uint64_t offset)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
@@ -795,13 +550,13 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 	/* The stride field in the resource descriptor has 14 bits */
 	assert(stride < (1 << 14));
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	if (buffer) {
 		uint64_t va;
 
-		va = r600_resource(buffer)->gpu_address;
+		va = r600_resource(buffer)->gpu_address + offset;
 
 		switch (element_size) {
 		default:
@@ -839,8 +594,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			break;
 		}
 
+		if (sctx->b.chip_class >= VI && stride)
+			num_records *= stride;
+
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(stride) |
@@ -863,12 +621,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* STREAMOUT BUFFERS */
@@ -929,15 +686,21 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			struct pipe_resource *buffer = targets[i]->buffer;
 			uint64_t va = r600_resource(buffer)->gpu_address;
 
-			/* Set the descriptor. */
-			uint32_t *desc = buffers->desc_data[bufidx];
+			/* Set the descriptor.
+			 *
+			 * On VI, the format must be non-INVALID, otherwise
+			 * the buffer will be considered not bound and store
+			 * instructions will be no-ops.
+			 */
+			uint32_t *desc = buffers->desc.list + bufidx*4;
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
 			desc[2] = 0xffffffff;
 			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
 				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
 				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
+				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 
 			/* Set the resource. */
 			pipe_resource_reference(&buffers->buffers[bufidx],
@@ -948,24 +711,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			buffers->desc.enabled_mask |= 1llu << bufidx;
 		} else {
 			/* Clear the descriptor and unset the resource. */
-			memset(buffers->desc_data[bufidx], 0,
+			memset(buffers->desc.list + bufidx*4, 0,
 			       sizeof(uint32_t) * 4);
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						NULL);
 			buffers->desc.enabled_mask &= ~(1llu << bufidx);
 		}
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 	for (; i < old_num_targets; i++) {
 		bufidx = SI_SO_BUF_OFFSET + i;
 		/* Clear the descriptor and unset the resource. */
-		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
 		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
 		buffers->desc.enabled_mask &= ~(1llu << bufidx);
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
@@ -1034,22 +795,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Read/Write buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
-
 				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
 					/* Update the streamout state. */
 					if (sctx->b.streamout.begin_emitted) {
@@ -1061,34 +819,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 				}
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Constant buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
-
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Texture buffers - update virtual addresses in sampler view descriptors. */
@@ -1100,223 +849,211 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Texture buffers - update bindings. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_sampler_views *views = &sctx->samplers[shader].views;
-		bool found = false;
 		uint64_t mask = views->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (views->views[i]->texture == buf) {
+				si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
+							    old_va, buf);
+				views->desc.list_dirty = true;
+
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, RADEON_USAGE_READ,
 						      RADEON_PRIO_SHADER_BUFFER_RO);
-
-				views->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &views->desc);
-		}
 	}
 }
 
-/* CP DMA */
+/* SHADER USER DATA */
 
-/* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
-
-static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
-			    unsigned offset, unsigned size, unsigned value,
-			    bool is_framebuffer)
+static void si_mark_shader_pointers_dirty(struct si_context *sctx,
+					  unsigned shader)
 {
-	struct si_context *sctx = (struct si_context*)ctx;
-	unsigned flush_flags, tc_l2_flag;
+	sctx->const_buffers[shader].desc.pointer_dirty = true;
+	sctx->rw_buffers[shader].desc.pointer_dirty = true;
+	sctx->samplers[shader].views.desc.pointer_dirty = true;
+	sctx->samplers[shader].states.desc.pointer_dirty = true;
 
-	if (!size)
-		return;
+	if (shader == PIPE_SHADER_VERTEX)
+		sctx->vertex_buffers.pointer_dirty = true;
 
-	/* Mark the buffer range of destination as valid (initialized),
-	 * so that transfer_map knows it should wait for the GPU when mapping
-	 * that range. */
-	util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
-		       offset + size);
-
-	/* Fallback for unaligned clears. */
-	if (offset % 4 != 0 || size % 4 != 0) {
-		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
-						       sctx->b.rings.gfx.cs,
-						       PIPE_TRANSFER_WRITE);
-		size /= 4;
-		for (unsigned i = 0; i < size; i++)
-			*map++ = value;
-		return;
-	}
-
-	uint64_t va = r600_resource(dst)->gpu_address + offset;
-
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
-	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
-
-	while (size) {
-		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
-		unsigned dma_flags = tc_l2_flag;
-
-		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
-				 FALSE);
-
-		/* This must be done after need_cs_space. */
-		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
-				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
-				      RADEON_PRIO_MIN);
-
-		/* Flush the caches for the first copy only.
-		 * Also wait for the previous CP DMA operations. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(&sctx->b, NULL);
-			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count)
-			dma_flags |= R600_CP_DMA_SYNC;
-
-		/* Emit the clear packet. */
-		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
-
-		size -= byte_count;
-		va += byte_count;
-	}
-
-	/* Flush the caches again in case the 3D engine has been prefetching
-	 * the resource. */
-	sctx->b.flags |= flush_flags;
-
-	if (tc_l2_flag)
-		r600_resource(dst)->TC_L2_dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
 }
 
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
-		    bool is_framebuffer)
+static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
 {
-	unsigned flush_flags, tc_l2_flag;
+	int i;
 
-	if (!size)
-		return;
-
-	/* Mark the buffer range of destination as valid (initialized),
-	 * so that transfer_map knows it should wait for the GPU when mapping
-	 * that range. */
-	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
-		       dst_offset + size);
-
-	dst_offset += r600_resource(dst)->gpu_address;
-	src_offset += r600_resource(src)->gpu_address;
-
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		si_mark_shader_pointers_dirty(sctx, i);
 	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
-
-	while (size) {
-		unsigned sync_flags = tc_l2_flag;
-		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
-
-		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
-
-		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(&sctx->b, NULL);
-			sync_flags |= SI_CP_DMA_RAW_WAIT;
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count) {
-			sync_flags |= R600_CP_DMA_SYNC;
-		}
-
-		/* This must be done after r600_need_cs_space. */
-		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
-				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
-		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
-				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
-
-		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
-
-		size -= byte_count;
-		src_offset += byte_count;
-		dst_offset += byte_count;
-	}
-
-	/* Flush the caches again in case the 3D engine has been prefetching
-	 * the resource. */
-	sctx->b.flags |= flush_flags;
-
-	if (tc_l2_flag)
-		r600_resource(dst)->TC_L2_dirty = true;
 }
 
-/* INIT/DEINIT */
+/* Set a base register address for user data constants in the given shader.
+ * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
+ */
+static void si_set_user_data_base(struct si_context *sctx,
+				  unsigned shader, uint32_t new_base)
+{
+	uint32_t *base = &sctx->shader_userdata.sh_base[shader];
+
+	if (*base != new_base) {
+		*base = new_base;
+
+		if (new_base)
+			si_mark_shader_pointers_dirty(sctx, shader);
+	}
+}
+
+/* This must be called when these shaders are changed from non-NULL to NULL
+ * and vice versa:
+ * - geometry shader
+ * - tessellation control shader
+ * - tessellation evaluation shader
+ */
+void si_shader_change_notify(struct si_context *sctx)
+{
+	/* VS can be bound as VS, ES, or LS. */
+	if (sctx->tes_shader)
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B530_SPI_SHADER_USER_DATA_LS_0);
+	else if (sctx->gs_shader)
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
+	else
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
+
+	/* TES can be bound as ES, VS, or not bound. */
+	if (sctx->tes_shader) {
+		if (sctx->gs_shader)
+			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
+					      R_00B330_SPI_SHADER_USER_DATA_ES_0);
+		else
+			si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL,
+					      R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	} else {
+		si_set_user_data_base(sctx, PIPE_SHADER_TESS_EVAL, 0);
+	}
+}
+
+static void si_emit_shader_pointer(struct si_context *sctx,
+				   struct si_descriptors *desc,
+				   unsigned sh_base, bool keep_dirty)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	uint64_t va;
+
+	if (!desc->pointer_dirty || !desc->buffer)
+		return;
+
+	va = desc->buffer->gpu_address +
+	     desc->buffer_offset;
+
+	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
+	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+
+	desc->pointer_dirty = keep_dirty;
+}
+
+static void si_emit_shader_userdata(struct si_context *sctx,
+				    struct r600_atom *atom)
+{
+	unsigned i;
+	uint32_t *sh_base = sctx->shader_userdata.sh_base;
+
+	if (sctx->gs_shader) {
+		/* The VS copy shader needs these for clipping, streamout, and rings. */
+		unsigned vs_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
+		unsigned i = PIPE_SHADER_VERTEX;
+
+		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, vs_base, true);
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, vs_base, true);
+
+		/* The TESSEVAL shader needs this for the ESGS ring buffer. */
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc,
+				       R_00B330_SPI_SHADER_USER_DATA_ES_0, true);
+	} else if (sctx->tes_shader) {
+		/* The TESSEVAL shader needs this for streamout. */
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[PIPE_SHADER_VERTEX].desc,
+				       R_00B130_SPI_SHADER_USER_DATA_VS_0, true);
+	}
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		unsigned base = sh_base[i];
+
+		if (!base)
+			continue;
+
+		if (i != PIPE_SHADER_TESS_EVAL)
+			si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
+
+		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false);
+	}
+	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
+}
+
+/* INIT/DEINIT/UPLOAD */
 
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
-					 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
+		si_init_buffer_resources(&sctx->const_buffers[i],
+					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
 					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
-		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
-					 i == PIPE_SHADER_VERTEX ?
-					 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
-					 i, SI_SGPR_RW_BUFFERS,
+		si_init_buffer_resources(&sctx->rw_buffers[i],
+					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
 
-		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);
-
-		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
-				    si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
-				    4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);
-
-		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
-		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
-		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
-		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
+		si_init_descriptors(&sctx->samplers[i].views.desc,
+				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+		si_init_descriptors(&sctx->samplers[i].states.desc,
+				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
 	}
 
-	si_init_descriptors(sctx, &sctx->vertex_buffers,
-			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
-			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
-			    si_emit_shader_pointer);
-	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
+	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+			    4, SI_NUM_VERTEX_BUFFERS);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
 	sctx->b.b.set_sampler_views = si_set_sampler_views;
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
-	sctx->b.clear_buffer = si_clear_buffer;
 	sctx->b.invalidate_buffer = si_invalidate_buffer;
+
+	/* Shader user data. */
+	sctx->atoms.s.shader_userdata = &sctx->shader_userdata.atom;
+	sctx->shader_userdata.atom.emit = (void*)si_emit_shader_userdata;
+
+	/* Upper bound, 4 pointers per shader, +1 for vertex buffers, +2 for the VS copy shader. */
+	sctx->shader_userdata.atom.num_dw = (SI_NUM_SHADERS * 4 + 1 + 2) * 4;
+
+	/* Set default and immutable mappings. */
+	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_TESS_CTRL, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+}
+
+bool si_upload_shader_descriptors(struct si_context *sctx)
+{
+	int i;
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
+			return false;
+	}
+	return si_upload_vertex_buffer_descriptors(sctx);
 }
 
 void si_release_all_descriptors(struct si_context *sctx)
@@ -1343,4 +1080,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
 	}
 	si_vertex_buffers_begin_new_cs(sctx);
+	si_shader_userdata_begin_new_cs(sctx);
 }
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 313ced7f5d1..307dc391431 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -30,10 +30,32 @@
 void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 			boolean count_draw_in)
 {
+	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
 	int i;
 
+	/* If the CS is sufficiently large, don't count the space needed
+	 * and just flush if there are fewer than 8192 dwords left. */
+	if (cs->max_dw >= 24 * 1024) {
+		if (cs->cdw > cs->max_dw - 8 * 1024)
+			ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		return;
+	}
+
+	/* There are two memory usage counters in the winsys for all buffers
+	 * that have been added (cs_add_reloc) and two counters in the pipe
+	 * driver for those that haven't been added yet.
+	 * */
+	if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) {
+		ctx->b.gtt = 0;
+		ctx->b.vram = 0;
+		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		return;
+	}
+	ctx->b.gtt = 0;
+	ctx->b.vram = 0;
+
 	/* The number of dwords we already used in the CS so far. */
-	num_dw += ctx->b.rings.gfx.cs->cdw;
+	num_dw += cs->cdw;
 
 	if (count_draw_in) {
 		for (i = 0; i < SI_NUM_ATOMS(ctx); i++) {
@@ -50,7 +72,8 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 	}
 
 	/* Count in queries_suspend. */
-	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend;
+	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend +
+		  ctx->b.num_cs_dw_timer_queries_suspend;
 
 	/* Count in streamout_end at the end of CS. */
 	if (ctx->b.streamout.begin_emitted) {
@@ -72,7 +95,7 @@ void si_need_cs_space(struct si_context *ctx, unsigned num_dw,
 #endif
 
 	/* Flush if there's not enough space. */
-	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+	if (num_dw > cs->max_dw) {
 		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 }
@@ -82,9 +105,16 @@ void si_context_gfx_flush(void *context, unsigned flags,
 {
 	struct si_context *ctx = context;
 	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys *ws = ctx->b.ws;
 
-	if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence)
+	if (cs->cdw == ctx->b.initial_gfx_cs_size &&
+	    (!fence || ctx->last_gfx_fence)) {
+		if (fence)
+			ws->fence_reference(fence, ctx->last_gfx_fence);
+		if (!(flags & RADEON_FLUSH_ASYNC))
+			ws->cs_sync_flush(cs);
 		return;
+	}
 
 	ctx->b.rings.gfx.flushing = true;
 
@@ -101,9 +131,13 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
 
 	/* Flush the CS. */
-	ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++);
+	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
+		     ctx->screen->b.cs_count++);
 	ctx->b.rings.gfx.flushing = false;
 
+	if (fence)
+		ws->fence_reference(fence, ctx->last_gfx_fence);
+
 #if SI_TRACE_CS
 	if (ctx->screen->b.trace_bo) {
 		struct si_screen *sscreen = ctx->screen;
@@ -111,7 +145,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 
 		for (i = 0; i < 10; i++) {
 			usleep(5);
-			if (!ctx->b.ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) {
+			if (!ws->buffer_is_busy(sscreen->b.trace_bo->buf, RADEON_USAGE_READWRITE)) {
 				break;
 			}
 		}
@@ -130,7 +164,8 @@ void si_context_gfx_flush(void *context, unsigned flags,
 void si_begin_new_cs(struct si_context *ctx)
 {
 	/* Flush read caches at the beginning of CS. */
-	ctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
+	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
+			SI_CONTEXT_INV_TC_L1 |
 			SI_CONTEXT_INV_TC_L2 |
 			SI_CONTEXT_INV_KCACHE |
 			SI_CONTEXT_INV_ICACHE;
@@ -143,24 +178,32 @@ void si_begin_new_cs(struct si_context *ctx)
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
 
-	ctx->clip_regs.dirty = true;
-	ctx->framebuffer.atom.dirty = true;
-	ctx->msaa_sample_locs.dirty = true;
-	ctx->msaa_config.dirty = true;
-	ctx->db_render_state.dirty = true;
-	ctx->b.streamout.enable_atom.dirty = true;
+	si_mark_atom_dirty(ctx, &ctx->clip_regs);
+	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
+	si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs);
+	si_mark_atom_dirty(ctx, &ctx->msaa_config);
+	si_mark_atom_dirty(ctx, &ctx->db_render_state);
+	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
 	r600_postflush_resume_features(&ctx->b);
 
 	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+
+	/* Invalidate various draw states so that they are emitted before
+	 * the first draw call. */
 	si_invalidate_draw_sh_constants(ctx);
 	ctx->last_primitive_restart_en = -1;
 	ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
 	ctx->last_gs_out_prim = -1;
 	ctx->last_prim = -1;
 	ctx->last_multi_vgt_param = -1;
+	ctx->last_ls_hs_config = -1;
 	ctx->last_rast_prim = -1;
 	ctx->last_sc_line_stipple = ~0;
 	ctx->emit_scratch_reloc = true;
+	ctx->last_ls = NULL;
+	ctx->last_tcs = NULL;
+	ctx->last_tes_sh_base = -1;
+	ctx->last_num_tcs_input_cp = -1;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 53ae71a8c92..473a2e9ad12 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -36,32 +36,42 @@
 static void si_destroy_context(struct pipe_context *context)
 {
 	struct si_context *sctx = (struct si_context *)context;
+	int i;
 
 	si_release_all_descriptors(sctx);
 
 	pipe_resource_reference(&sctx->esgs_ring, NULL);
 	pipe_resource_reference(&sctx->gsvs_ring, NULL);
+	pipe_resource_reference(&sctx->tf_ring, NULL);
 	pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
 	r600_resource_reference(&sctx->border_color_table, NULL);
 	r600_resource_reference(&sctx->scratch_buffer, NULL);
+	sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
 	si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
-	si_pm4_delete_state(sctx, gs_onoff, sctx->gs_on);
-	si_pm4_delete_state(sctx, gs_onoff, sctx->gs_off);
+	si_pm4_delete_state(sctx, tf_ring, sctx->tf_state);
+	for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
+		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
 
 	if (sctx->pstipple_sampler_state)
 		sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state);
-	if (sctx->dummy_pixel_shader) {
+	if (sctx->dummy_pixel_shader)
 		sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader);
-	}
-	sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
-	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve);
-	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress);
-	sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear);
+	if (sctx->fixed_func_tcs_shader)
+		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader);
+	if (sctx->custom_dsa_flush)
+		sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
+	if (sctx->custom_blend_resolve)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve);
+	if (sctx->custom_blend_decompress)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress);
+	if (sctx->custom_blend_fastclear)
+		sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear);
 	util_unreference_framebuffer_state(&sctx->framebuffer.state);
 
-	util_blitter_destroy(sctx->blitter);
+	if (sctx->blitter)
+		util_blitter_destroy(sctx->blitter);
 
 	si_pm4_cleanup(sctx);
 
@@ -74,6 +84,14 @@ static void si_destroy_context(struct pipe_context *context)
 	FREE(sctx);
 }
 
+static enum pipe_reset_status
+si_amdgpu_get_reset_status(struct pipe_context *ctx)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	return sctx->b.ws->ctx_query_reset_status(sctx->b.ctx);
+}
+
 static struct pipe_context *si_create_context(struct pipe_screen *screen, void *priv)
 {
 	struct si_context *sctx = CALLOC_STRUCT(si_context);
@@ -91,13 +109,18 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 	sctx->b.b.screen = screen; /* this must be set first */
 	sctx->b.b.priv = priv;
 	sctx->b.b.destroy = si_destroy_context;
+	sctx->b.set_atom_dirty = (void *)si_set_atom_dirty;
 	sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
 
 	if (!r600_common_context_init(&sctx->b, &sscreen->b))
 		goto fail;
 
+	if (sscreen->b.info.drm_major == 3)
+		sctx->b.b.get_device_reset_status = si_amdgpu_get_reset_status;
+
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
+	si_init_cp_dma_functions(sctx);
 
 	if (sscreen->b.info.has_uvd) {
 		sctx->b.b.create_video_codec = si_uvd_create_decoder;
@@ -107,7 +130,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.rings.gfx.cs = ws->cs_create(ws, RING_GFX, si_context_gfx_flush,
+	sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
 					     sctx, sscreen->b.trace_bo ?
 						sscreen->b.trace_bo->cs_buf : NULL);
 	sctx->b.rings.gfx.flush = si_context_gfx_flush;
@@ -127,17 +150,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 	sctx->atoms.s.streamout_begin = &sctx->b.streamout.begin_atom;
 	sctx->atoms.s.streamout_enable = &sctx->b.streamout.enable_atom;
 
-	switch (sctx->b.chip_class) {
-	case SI:
-	case CIK:
-		si_init_state_functions(sctx);
-		si_init_shader_functions(sctx);
-		si_init_config(sctx);
-		break;
-	default:
-		R600_ERR("Unsupported chip class %d.\n", sctx->b.chip_class);
-		goto fail;
-	}
+	si_init_state_functions(sctx);
+	si_init_shader_functions(sctx);
 
 	if (sscreen->b.debug_flags & DBG_FORCE_DMA)
 		sctx->b.b.resource_copy_region = sctx->b.dma_copy;
@@ -181,7 +195,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 	r600_target = radeon_llvm_get_r600_target(triple);
 	sctx->tm = LLVMCreateTargetMachine(r600_target, triple,
 					   r600_get_llvm_processor_name(sscreen->b.family),
-					   "+DumpCode,+vgpr-spilling",
+					   sctx->b.chip_class >= VI ?
+						   "+DumpCode" :
+						   "+DumpCode,+vgpr-spilling",
 					   LLVMCodeGenLevelDefault,
 					   LLVMRelocDefault,
 					   LLVMCodeModelDefault);
@@ -252,15 +268,27 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
 	case PIPE_CAP_TGSI_TEXCOORD:
+	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 		return !SI_BIG_ENDIAN && sscreen->b.info.has_userptr;
 
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+		return (sscreen->b.info.drm_major == 2 &&
+			sscreen->b.info.drm_minor >= 43) ||
+		       sscreen->b.info.drm_major == 3;
+
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 		/* 2D tiling on CIK is supported since DRM 2.35.0 */
 		return sscreen->b.chip_class < CIK ||
-		       sscreen->b.info.drm_minor >= 35;
+		       (sscreen->b.info.drm_major == 2 &&
+			sscreen->b.info.drm_minor >= 35) ||
+		       sscreen->b.info.drm_major == 3;
 
         case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
                 return R600_MAP_BUFFER_ALIGNMENT;
@@ -270,7 +298,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 		return 4;
 
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
-		return 330;
+		return HAVE_LLVM >= 0x0307 ? 410 : 330;
 
 	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
 		return MIN2(sscreen->b.info.vram_size, 0xFFFFFFFF);
@@ -289,13 +317,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
-	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
-	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 		return 0;
 
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+		return 30;
+
 	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
 		return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600;
 
@@ -314,7 +342,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
 		return 4095;
 	case PIPE_CAP_MAX_VERTEX_STREAMS:
-		return 1;
+		return 4;
 
 	case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
 		return 2048;
@@ -335,7 +363,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 		return 8;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
-		return 1;
+		return 16;
 
 	/* Timer queries, present when the clock frequency is non zero. */
 	case PIPE_CAP_QUERY_TIMESTAMP:
@@ -375,6 +403,13 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_VERTEX:
 	case PIPE_SHADER_GEOMETRY:
 		break;
+	case PIPE_SHADER_TESS_CTRL:
+	case PIPE_SHADER_TESS_EVAL:
+		/* LLVM 3.6.2 is required for tessellation because of bug fixes there */
+		if (HAVE_LLVM < 0x0306 ||
+		    (HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 2))
+			return 0;
+		break;
 	case PIPE_SHADER_COMPUTE:
 		switch (param) {
 		case PIPE_SHADER_CAP_PREFERRED_IR:
@@ -401,7 +436,6 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		}
 		break;
 	default:
-		/* TODO: support tessellation */
 		return 0;
 	}
 
@@ -433,7 +467,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 		/* Indirection of geometry shader input dimension is not
 		 * handled yet
 		 */
-		return shader < PIPE_SHADER_GEOMETRY;
+		return shader != PIPE_SHADER_GEOMETRY;
 	case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
 	case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
@@ -448,6 +482,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_PREFERRED_IR:
 		return PIPE_SHADER_IR_TGSI;
 	case PIPE_SHADER_CAP_DOUBLES:
+		return HAVE_LLVM >= 0x0307;
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 		return 0;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 2d67342f160..553e1f32683 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -48,7 +48,8 @@
 
 #define SI_MAX_DRAW_CS_DWORDS \
 	(/*scratch:*/ 3 + /*derived prim state:*/ 3 + \
-	 /*draw regs:*/ 16 + /*draw packets:*/ 31)
+	 /*draw regs:*/ 18 + /*draw packets:*/ 31 +\
+	 /*derived tess state:*/ 19)
 
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
@@ -125,8 +126,6 @@ struct si_framebuffer {
 
 #define SI_NUM_ATOMS(sctx) (sizeof((sctx)->atoms)/sizeof((sctx)->atoms.array[0]))
 
-#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1)
-
 struct si_context {
 	struct r600_common_context	b;
 	struct blitter_context		*blitter;
@@ -137,17 +136,12 @@ struct si_context {
 	void				*pstipple_sampler_state;
 	struct si_screen		*screen;
 	struct si_pm4_state		*init_config;
+	struct pipe_fence_handle	*last_gfx_fence;
+	struct si_shader_selector	*fixed_func_tcs_shader;
 
 	union {
 		struct {
 			/* The order matters. */
-			struct r600_atom *vertex_buffers;
-			struct r600_atom *const_buffers[SI_NUM_SHADERS];
-			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
-			struct r600_atom *sampler_views[SI_NUM_SHADERS];
-			struct r600_atom *sampler_states[SI_NUM_SHADERS];
-			/* Caches must be flushed after resource descriptors are
-			 * updated in memory. */
 			struct r600_atom *cache_flush;
 			struct r600_atom *streamout_begin;
 			struct r600_atom *streamout_enable; /* must be after streamout_begin */
@@ -156,6 +150,7 @@ struct si_context {
 			struct r600_atom *db_render_state;
 			struct r600_atom *msaa_config;
 			struct r600_atom *clip_regs;
+			struct r600_atom *shader_userdata;
 		} s;
 		struct r600_atom *array[0];
 	} atoms;
@@ -168,7 +163,10 @@ struct si_context {
 	struct si_shader_selector	*ps_shader;
 	struct si_shader_selector	*gs_shader;
 	struct si_shader_selector	*vs_shader;
+	struct si_shader_selector	*tcs_shader;
+	struct si_shader_selector	*tes_shader;
 	struct si_cs_shader_state	cs_shader_state;
+	struct si_shader_data		shader_userdata;
 	/* shader information */
 	unsigned			sprite_coord_enable;
 	bool				flatshade;
@@ -194,13 +192,16 @@ struct si_context {
 	/* With rasterizer discard, there doesn't have to be a pixel shader.
 	 * In that case, we bind this one: */
 	void			*dummy_pixel_shader;
-	struct si_pm4_state	*gs_on;
-	struct si_pm4_state	*gs_off;
-	struct si_pm4_state	*gs_rings;
 	struct r600_atom	cache_flush;
 	struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on CIK */
+
+	/* VGT states. */
+	struct si_pm4_state	*vgt_shader_config[4];
+	struct si_pm4_state	*gs_rings;
 	struct pipe_resource	*esgs_ring;
 	struct pipe_resource	*gsvs_ring;
+	struct si_pm4_state	*tf_state;
+	struct pipe_resource	*tf_ring;
 
 	LLVMTargetMachineRef		tm;
 
@@ -218,7 +219,7 @@ struct si_context {
 	bool			db_depth_disable_expclear;
 	unsigned		ps_db_shader_control;
 
-	/* Draw state. */
+	/* Emitted draw state. */
 	int			last_base_vertex;
 	int			last_start_instance;
 	int			last_sh_base_reg;
@@ -227,6 +228,7 @@ struct si_context {
 	int			last_gs_out_prim;
 	int			last_prim;
 	int			last_multi_vgt_param;
+	int			last_ls_hs_config;
 	int			last_rast_prim;
 	unsigned		last_sc_line_stipple;
 	int			current_rast_prim; /* primitive type after TES, GS */
@@ -235,6 +237,12 @@ struct si_context {
 	boolean                 emit_scratch_reloc;
 	unsigned		scratch_waves;
 	unsigned		spi_tmpring_size;
+
+	/* Emitted derived tessellation state. */
+	struct si_shader	*last_ls; /* local shader (VS) */
+	struct si_shader_selector *last_tcs;
+	int			last_num_tcs_input_cp;
+	int			last_tes_sh_base;
 };
 
 /* cik_sdma.c */
@@ -260,6 +268,13 @@ void si_resource_copy_region(struct pipe_context *ctx,
 			     unsigned src_level,
 			     const struct pipe_box *src_box);
 
+/* si_cp_dma.c */
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
+		    bool is_framebuffer);
+void si_init_cp_dma_functions(struct si_context *sctx);
+
 /* si_dma.c */
 void si_dma_copy(struct pipe_context *ctx,
 		 struct pipe_resource *dst,
@@ -293,7 +308,7 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
  * common helpers
  */
 
-static INLINE struct r600_resource *
+static inline struct r600_resource *
 si_resource_create_custom(struct pipe_screen *screen,
 			  unsigned usage, unsigned size)
 {
@@ -302,7 +317,7 @@ si_resource_create_custom(struct pipe_screen *screen,
 		PIPE_BIND_CUSTOM, usage, size));
 }
 
-static INLINE void
+static inline void
 si_invalidate_draw_sh_constants(struct si_context *sctx)
 {
 	sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
@@ -310,4 +325,18 @@ si_invalidate_draw_sh_constants(struct si_context *sctx)
 	sctx->last_sh_base_reg = -1; /* reset to an unknown value */
 }
 
+static inline void /* Write `dirty` into the atom's dirty flag. */
+si_set_atom_dirty(struct si_context *sctx, /* sctx is not read in this body */
+		  struct r600_atom *atom, bool dirty)
+{
+	atom->dirty = dirty;
+}
+
+static inline void /* Flag an atom as dirty. */
+si_mark_atom_dirty(struct si_context *sctx,
+		   struct r600_atom *atom)
+{
+	si_set_atom_dirty(sctx, atom, true); /* same as setting the flag to true */
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 47e5f96cbed..4288e9b2ab1 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -31,6 +31,7 @@
 #include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_flow.h"
 #include "radeon/r600_cs.h"
 #include "radeon/radeon_llvm.h"
@@ -71,18 +72,25 @@ struct si_shader_context
 	int param_streamout_write_index;
 	int param_streamout_offset[4];
 	int param_vertex_id;
+	int param_rel_auto_id;
+	int param_vs_prim_id;
 	int param_instance_id;
+	int param_tes_u;
+	int param_tes_v;
+	int param_tes_rel_patch_id;
+	int param_tes_patch_id;
+	int param_es2gs_offset;
 	LLVMTargetMachineRef tm;
 	LLVMValueRef const_md;
 	LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
-	LLVMValueRef ddxy_lds;
+	LLVMValueRef lds;
 	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
 	LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
 	LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
-	LLVMValueRef gsvs_ring;
-	LLVMValueRef gs_next_vertex;
+	LLVMValueRef gsvs_ring[4];
+	LLVMValueRef gs_next_vertex[4];
 };
 
 static struct si_shader_context * si_shader_context(
@@ -129,12 +137,29 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 		assert(index <= 1);
 		return 2 + index;
 	case TGSI_SEMANTIC_GENERIC:
-		assert(index <= 63-4);
-		return 4 + index;
+		if (index <= 63-4)
+			return 4 + index;
+		else
+			/* same explanation as in the default statement,
+			 * the only user hitting this is st/nine.
+			 */
+			return 0;
+
+	/* patch indices are completely separate and thus start from 0 */
+	case TGSI_SEMANTIC_TESSOUTER:
+		return 0;
+	case TGSI_SEMANTIC_TESSINNER:
+		return 1;
+	case TGSI_SEMANTIC_PATCH:
+		return 2 + index;
 
 	default:
-		assert(0);
-		return 63;
+		/* Don't fail here. The result of this function is only used
+		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
+		 * occur, but this function is called for all vertex shaders
+		 * before it's known whether LS will be compiled or not.
+		 */
+		return 0;
 	}
 }
 
@@ -205,6 +230,136 @@ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
 	return value;
 }
 
+static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx)
+{ /* Return the relative patch ID for the stage being compiled (TCS or TES only). */
+	switch (si_shader_ctx->type) {
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 0, 8); /* presumably an 8-bit field at bit 0 -- confirm against unpack_param */
+
+	case TGSI_PROCESSOR_TESS_EVAL:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    si_shader_ctx->param_tes_rel_patch_id); /* TES reads it from its own function parameter */
+
+	default: /* any other stage trips the assert and yields NULL */
+		assert(0);
+		return NULL;
+	}
+}
+
+/* Tessellation shaders pass outputs to the next shader using LDS.
+ *
+ * LS outputs = TCS inputs
+ * TCS outputs = TES inputs
+ *
+ * The LDS layout is:
+ * - TCS inputs for patch 0
+ * - TCS inputs for patch 1
+ * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
+ * - ...
+ * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
+ * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
+ * - TCS outputs for patch 1
+ * - Per-patch TCS outputs for patch 1
+ * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
+ * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
+ * - ...
+ *
+ * All three shaders VS(LS), TCS, TES share the same LDS space.
+ */
+
+static LLVMValueRef /* Patch stride of the TCS-input area, unpacked from the current stage's layout parameter. */
+get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx)
+{
+	if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) /* VS acting as LS: its outputs are the TCS inputs */
+		return unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
+	else if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
+		return unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13); /* same (0, 13) field arguments as the VS case */
+	else { /* no other stage is handled: assert and return NULL */
+		assert(0);
+		return NULL;
+	}
+}
+
+static LLVMValueRef /* Patch stride of the TCS-output area, unpacked from SI_PARAM_TCS_OUT_LAYOUT. */
+get_tcs_out_patch_stride(struct si_shader_context *si_shader_ctx)
+{
+	return unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13); /* presumably a 13-bit field at bit 0 -- confirm */
+}
+
+static LLVMValueRef /* LDS offset where the TCS outputs of patch 0 begin. */
+get_tcs_out_patch0_offset(struct si_shader_context *si_shader_ctx)
+{
+	return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
+				unpack_param(si_shader_ctx,
+					     SI_PARAM_TCS_OUT_OFFSETS,
+					     0, 16), /* presumably the low 16-bit field -- confirm */
+				4); /* packed value is scaled by 4; NOTE(review): unit of the packed value is not shown here */
+}
+
+static LLVMValueRef /* LDS offset where the per-patch TCS outputs of patch 0 begin. */
+get_tcs_out_patch0_patch_data_offset(struct si_shader_context *si_shader_ctx)
+{
+	return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
+				unpack_param(si_shader_ctx,
+					     SI_PARAM_TCS_OUT_OFFSETS,
+					     16, 16), /* presumably the high 16-bit field -- confirm */
+				4); /* packed value is scaled by 4, as in get_tcs_out_patch0_offset */
+}
+
+static LLVMValueRef /* Offset of the current patch's TCS inputs: patch stride * relative patch ID. */
+get_tcs_in_current_patch_offset(struct si_shader_context *si_shader_ctx)
+{
+	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
+	LLVMValueRef patch_stride = get_tcs_in_patch_stride(si_shader_ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx); /* NOTE(review): asserts for VS, so this is effectively TCS-only */
+
+	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
+}
+
+static LLVMValueRef /* Offset of the current patch's TCS outputs. */
+get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx)
+{
+	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
+	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(si_shader_ctx);
+	LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx);
+	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+
+	return LLVMBuildAdd(gallivm->builder, patch0_offset, /* patch0_offset + patch_stride * rel_patch_id */
+			    LLVMBuildMul(gallivm->builder, patch_stride,
+					 rel_patch_id, ""),
+			    "");
+}
+
+static LLVMValueRef /* Offset of the current patch's per-patch TCS outputs. */
+get_tcs_out_current_patch_data_offset(struct si_shader_context *si_shader_ctx)
+{
+	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
+	LLVMValueRef patch0_patch_data_offset =
+		get_tcs_out_patch0_patch_data_offset(si_shader_ctx);
+	LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx); /* same stride as the per-vertex output area */
+	LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
+
+	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset, /* patch0 data offset + patch_stride * rel_patch_id */
+			    LLVMBuildMul(gallivm->builder, patch_stride,
+					 rel_patch_id, ""),
+			    "");
+}
+
+static void build_indexed_store(struct si_shader_context *si_shader_ctx,
+				LLVMValueRef base_ptr, LLVMValueRef index,
+				LLVMValueRef value)
+{ /* Emit a store of `value` to the element of base_ptr selected by `index`. */
+	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMValueRef indices[2], pointer;
+
+	indices[0] = bld_base->uint_bld.zero; /* first GEP index is always zero */
+	indices[1] = index; /* second GEP index picks the element */
+
+	pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
+	LLVMBuildStore(gallivm->builder, value, pointer);
+}
+
 /**
  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
  * It's equivalent to doing a load from &base_ptr[index].
@@ -308,7 +463,7 @@ static void declare_input_vs(
 	args[0] = t_list;
 	args[1] = attribute_offset;
 	args[2] = buffer_index;
-	input = build_intrinsic(gallivm->builder,
+	input = lp_build_intrinsic(gallivm->builder,
 		"llvm.SI.vs.load.input", vec4_type, args, 3,
 		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
@@ -323,6 +478,285 @@ static void declare_input_vs(
 	}
 }
 
+static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
+				     unsigned swizzle)
+{ /* Return the primitive ID for the current stage; only channel 0 carries it. */
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+
+	if (swizzle > 0) /* channels other than 0 read as zero */
+		return bld_base->uint_bld.zero;
+
+	switch (si_shader_ctx->type) { /* each stage reads the ID from a different function parameter */
+	case TGSI_PROCESSOR_VERTEX:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    si_shader_ctx->param_vs_prim_id); /* parameter index stored in the shader context */
+	case TGSI_PROCESSOR_TESS_CTRL:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    SI_PARAM_PATCH_ID); /* fixed parameter index */
+	case TGSI_PROCESSOR_TESS_EVAL:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    si_shader_ctx->param_tes_patch_id); /* parameter index stored in the shader context */
+	case TGSI_PROCESSOR_GEOMETRY:
+		return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				    SI_PARAM_PRIMITIVE_ID); /* fixed parameter index */
+	default: /* unsupported stage: assert, and return zero when asserts are compiled out */
+		assert(0);
+		return bld_base->uint_bld.zero;
+	}
+}
+
+/**
+ * Build the index value for an indirectly addressed register: load the
+ * address register selected by ind->Index / ind->Swizzle and add rel_index.
+ */
+static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx,
+				       const struct tgsi_ind_register *ind,
+				       int rel_index)
+{
+	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
+	LLVMValueRef result;
+
+	result = si_shader_ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle]; /* storage slot of the address register channel */
+	result = LLVMBuildLoad(gallivm->builder, result, ""); /* its current value */
+	result = LLVMBuildAdd(gallivm->builder, result,
+			      lp_build_const_int32(gallivm, rel_index), ""); /* plus the constant offset */
+	return result;
+}
+
+/**
+ * Calculate a dword address given an input or output register and a stride.
+ */
+static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx,
+				   const struct tgsi_full_dst_register *dst, /* used only when src is NULL */
+				   const struct tgsi_full_src_register *src, /* takes precedence over dst when non-NULL */
+				   LLVMValueRef vertex_dw_stride, /* read only for 2-dimensional registers */
+				   LLVMValueRef base_addr) /* address that the register offset is added to */
+{
+	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
+	struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info;
+	ubyte *name, *index, *array_first;
+	int first, param;
+	struct tgsi_full_dst_register reg; /* not zero-initialized; the src path fills only the fields read below */
+
+	/* Set the register description. The address computation is the same
+	 * for sources and destinations. */
+	if (src) {
+		reg.Register.File = src->Register.File;
+		reg.Register.Index = src->Register.Index;
+		reg.Register.Indirect = src->Register.Indirect;
+		reg.Register.Dimension = src->Register.Dimension;
+		reg.Indirect = src->Indirect;
+		reg.Dimension = src->Dimension;
+		reg.DimIndirect = src->DimIndirect;
+	} else
+		reg = *dst;
+
+	/* If the register is 2-dimensional (e.g. an array of vertices
+	 * in a primitive), calculate the base address of the vertex. */
+	if (reg.Register.Dimension) {
+		LLVMValueRef index; /* shadows the outer ubyte *index inside this scope */
+
+		if (reg.Dimension.Indirect)
+			index = get_indirect_index(si_shader_ctx, &reg.DimIndirect,
+						   reg.Dimension.Index);
+		else
+			index = lp_build_const_int32(gallivm, reg.Dimension.Index);
+
+		base_addr = LLVMBuildAdd(gallivm->builder, base_addr, /* base_addr += index * vertex_dw_stride */
+					 LLVMBuildMul(gallivm->builder, index,
+						      vertex_dw_stride, ""), "");
+	}
+
+	/* Get information about the register. */
+	if (reg.Register.File == TGSI_FILE_INPUT) {
+		name = info->input_semantic_name;
+		index = info->input_semantic_index;
+		array_first = info->input_array_first;
+	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
+		name = info->output_semantic_name;
+		index = info->output_semantic_index;
+		array_first = info->output_array_first;
+	} else { /* only input and output files are handled */
+		assert(0);
+		return NULL;
+	}
+
+	if (reg.Register.Indirect) {
+		/* Add the relative address of the element. */
+		LLVMValueRef ind_index;
+
+		if (reg.Indirect.ArrayID)
+			first = array_first[reg.Indirect.ArrayID]; /* first register of the declared array */
+		else
+			first = reg.Register.Index; /* no array ID: the register itself is the base */
+
+		ind_index = get_indirect_index(si_shader_ctx, &reg.Indirect,
+					   reg.Register.Index - first); /* constant part is relative to `first` */
+
+		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
+				    LLVMBuildMul(gallivm->builder, ind_index,
+						 lp_build_const_int32(gallivm, 4), ""), ""); /* 4 address units per register; presumably one dword per channel -- confirm */
+
+		param = si_shader_io_get_unique_index(name[first], index[first]); /* slot of the array's first register */
+	} else {
+		param = si_shader_io_get_unique_index(name[reg.Register.Index],
+						      index[reg.Register.Index]);
+	}
+
+	/* Add the base address of the element. */
+	return LLVMBuildAdd(gallivm->builder, base_addr,
+			    lp_build_const_int32(gallivm, param * 4), "");
+}
+
+/**
+ * Load from LDS.
+ *
+ * \param type		output value type
+ * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
+ * \param dw_addr	address in dwords
+ */
+static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
+			     enum tgsi_opcode_type type, unsigned swizzle,
+			     LLVMValueRef dw_addr)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMValueRef value;
+
+	if (swizzle == ~0) { /* all channels requested: load each one and gather the results */
+		LLVMValueRef values[TGSI_NUM_CHANNELS];
+
+		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
+			values[chan] = lds_load(bld_base, type, chan, dw_addr); /* recursive single-channel load */
+
+		return lp_build_gather_values(bld_base->base.gallivm, values,
+					      TGSI_NUM_CHANNELS);
+	}
+
+	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+			    lp_build_const_int32(gallivm, swizzle)); /* channel offset within the register */
+
+	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	return LLVMBuildBitCast(gallivm->builder, value,
+				tgsi2llvmtype(bld_base, type), ""); /* reinterpret the loaded value as the requested type */
+}
+
+/**
+ * Store to LDS.
+ *
+ * \param swizzle	offset (typically 0..3)
+ * \param dw_addr	address in dwords
+ * \param value		value to store
+ */
+static void lds_store(struct lp_build_tgsi_context * bld_base,
+		      unsigned swizzle, LLVMValueRef dw_addr,
+		      LLVMValueRef value)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+
+	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+			    lp_build_const_int32(gallivm, swizzle)); /* channel offset within the register */
+
+	value = LLVMBuildBitCast(gallivm->builder, value,
+				 LLVMInt32TypeInContext(gallivm->context), ""); /* values are stored as 32-bit integers */
+	build_indexed_store(si_shader_ctx, si_shader_ctx->lds,
+			    dw_addr, value);
+}
+
+static LLVMValueRef fetch_input_tcs( /* Read a TCS input register from LDS. */
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *reg,
+	enum tgsi_opcode_type type, unsigned swizzle)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	LLVMValueRef dw_addr, stride;
+
+	stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8); /* per-vertex stride; presumably an 8-bit field at bit 13 -- confirm */
+	dw_addr = get_tcs_in_current_patch_offset(si_shader_ctx); /* start of this patch's inputs */
+	dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr); /* add vertex and register offsets */
+
+	return lds_load(bld_base, type, swizzle, dw_addr);
+}
+
+static LLVMValueRef fetch_output_tcs( /* Read back a TCS output register from LDS. */
+		struct lp_build_tgsi_context *bld_base,
+		const struct tgsi_full_src_register *reg,
+		enum tgsi_opcode_type type, unsigned swizzle)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	LLVMValueRef dw_addr, stride;
+
+	if (reg->Register.Dimension) { /* 2-dimensional register: per-vertex output */
+		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); /* per-vertex stride; presumably an 8-bit field at bit 13 -- confirm */
+		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+	} else { /* 1-dimensional register: per-patch output, so no vertex stride is passed */
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
+	}
+
+	return lds_load(bld_base, type, swizzle, dw_addr);
+}
+
+static LLVMValueRef fetch_input_tes( /* Read a TES input register from LDS (TCS outputs = TES inputs). */
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *reg,
+	enum tgsi_opcode_type type, unsigned swizzle)
+{ /* NOTE(review): body is identical to fetch_output_tcs; a shared helper would remove the duplication */
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	LLVMValueRef dw_addr, stride;
+
+	if (reg->Register.Dimension) { /* 2-dimensional register: per-vertex input */
+		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8); /* stride comes from the TCS output layout */
+		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
+	} else { /* 1-dimensional register: per-patch input, so no vertex stride is passed */
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
+	}
+
+	return lds_load(bld_base, type, swizzle, dw_addr);
+}
+
+static void store_output_tcs(struct lp_build_tgsi_context * bld_base, /* Write a TCS output register to LDS. */
+			     const struct tgsi_full_instruction * inst,
+			     const struct tgsi_opcode_info * info,
+			     LLVMValueRef dst[4]) /* one value per channel, indexed by channel */
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	const struct tgsi_full_dst_register *reg = &inst->Dst[0]; /* only the first destination is examined */
+	unsigned chan_index;
+	LLVMValueRef dw_addr, stride;
+
+	/* Only handle per-patch and per-vertex outputs here.
+	 * Vectors will be lowered to scalars and this function will be called again.
+	 */
+	if (reg->Register.File != TGSI_FILE_OUTPUT ||
+	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
+		radeon_llvm_emit_store(bld_base, inst, info, dst); /* hand everything else to the generic store */
+		return;
+	}
+
+	if (reg->Register.Dimension) { /* 2-dimensional register: per-vertex output */
+		stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
+		dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, reg, NULL, stride, dw_addr);
+	} else { /* 1-dimensional register: per-patch output, so no vertex stride is passed */
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = get_dw_address(si_shader_ctx, reg, NULL, NULL, dw_addr);
+	}
+
+	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) { /* iterates channels of Dst[0]; presumably only write-enabled ones */
+		LLVMValueRef value = dst[chan_index];
+
+		if (inst->Instruction.Saturate) /* apply saturation before storing */
+			value = radeon_llvm_saturate(bld_base, value);
+
+		lds_store(bld_base, chan_index, dw_addr, value);
+	}
+}
+
 static LLVMValueRef fetch_input_gs(
 	struct lp_build_tgsi_context *bld_base,
 	const struct tgsi_full_src_register *reg,
@@ -342,13 +776,8 @@ static LLVMValueRef fetch_input_gs(
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
 
-	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) {
-		if (swizzle == 0)
-			return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-					    SI_PARAM_PRIMITIVE_ID);
-		else
-			return uint->zero;
-	}
+	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
+		return get_primitive_id(bld_base, swizzle);
 
 	if (!reg->Register.Dimension)
 		return NULL;
@@ -380,7 +809,7 @@ static LLVMValueRef fetch_input_gs(
 	args[1] = vtx_offset;
 	args[2] = lp_build_const_int32(gallivm,
 				       (get_param_index(semantic_name, semantic_index,
-							shader->selector->gs_used_inputs) * 4 +
+							shader->selector->inputs_read) * 4 +
 					swizzle) * 256);
 	args[3] = uint->zero;
 	args[4] = uint->one;  /* OFFEN */
@@ -390,13 +819,42 @@ static LLVMValueRef fetch_input_gs(
 	args[8] = uint->zero; /* TFE */
 
 	return LLVMBuildBitCast(gallivm->builder,
-				build_intrinsic(gallivm->builder,
+				lp_build_intrinsic(gallivm->builder,
 						"llvm.SI.buffer.load.dword.i32.i32",
 						i32, args, 9,
 						LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
 				tgsi2llvmtype(bld_base, type), "");
 }
 
+static int lookup_interp_param_index(unsigned interpolate, unsigned location)
+{ /* Map a TGSI interpolation mode and location to a shader parameter index. */
+	switch (interpolate) {
+	case TGSI_INTERPOLATE_CONSTANT:
+		return 0; /* 0 means no interpolation parameter; declare_input_fs then leaves interp_param NULL */
+
+	case TGSI_INTERPOLATE_LINEAR:
+		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
+			return SI_PARAM_LINEAR_SAMPLE;
+		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
+			return SI_PARAM_LINEAR_CENTROID;
+		else /* every other location uses the center variant */
+			return SI_PARAM_LINEAR_CENTER;
+		break; /* not reached: every branch above returns */
+	case TGSI_INTERPOLATE_COLOR: /* COLOR shares the perspective parameters */
+	case TGSI_INTERPOLATE_PERSPECTIVE:
+		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
+			return SI_PARAM_PERSP_SAMPLE;
+		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
+			return SI_PARAM_PERSP_CENTROID;
+		else /* every other location uses the center variant */
+			return SI_PARAM_PERSP_CENTER;
+		break; /* not reached: every branch above returns */
+	default: /* unknown mode: warn on stderr and return -1 */
+		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
+		return -1;
+	}
+}
+
 static void declare_input_fs(
 	struct radeon_llvm_context *radeon_bld,
 	unsigned input_index,
@@ -411,7 +869,8 @@ static void declare_input_fs(
 	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
 	LLVMValueRef main_fn = radeon_bld->main_fn;
 
-	LLVMValueRef interp_param;
+	LLVMValueRef interp_param = NULL;
+	int interp_param_idx;
 	const char * intr_name;
 
 	/* This value is:
@@ -460,31 +919,13 @@ static void declare_input_fs(
 	attr_number = lp_build_const_int32(gallivm,
 					   shader->ps_input_param_offset[input_index]);
 
-	switch (decl->Interp.Interpolate) {
-	case TGSI_INTERPOLATE_CONSTANT:
-		interp_param = 0;
-		break;
-	case TGSI_INTERPOLATE_LINEAR:
-		if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_SAMPLE);
-		else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTROID);
-		else
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTER);
-		break;
-	case TGSI_INTERPOLATE_COLOR:
-	case TGSI_INTERPOLATE_PERSPECTIVE:
-		if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_SAMPLE);
-		else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID)
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTROID);
-		else
-			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTER);
-		break;
-	default:
-		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
+	shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate;
+	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
+						     decl->Interp.Location);
+	if (interp_param_idx == -1)
 		return;
-	}
+	else if (interp_param_idx)
+		interp_param = LLVMGetParam(main_fn, interp_param_idx);
 
 	/* fs.constant returns the param from the middle vertex, so it's not
 	 * really useful for flat shading. It's meant to be used for custom
@@ -522,12 +963,12 @@ static void declare_input_fs(
 
 			args[0] = llvm_chan;
 			args[1] = attr_number;
-			front = build_intrinsic(gallivm->builder, intr_name,
+			front = lp_build_intrinsic(gallivm->builder, intr_name,
 						input_type, args, args[3] ? 4 : 3,
 						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
 			args[1] = back_attr_number;
-			back = build_intrinsic(gallivm->builder, intr_name,
+			back = lp_build_intrinsic(gallivm->builder, intr_name,
 					       input_type, args, args[3] ? 4 : 3,
 					       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
@@ -548,7 +989,7 @@ static void declare_input_fs(
 		args[2] = params;
 		args[3] = interp_param;
 		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
-			build_intrinsic(gallivm->builder, intr_name,
+			lp_build_intrinsic(gallivm->builder, intr_name,
 					input_type, args, args[3] ? 4 : 3,
 					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
@@ -566,7 +1007,7 @@ static void declare_input_fs(
 			args[2] = params;
 			args[3] = interp_param;
 			radeon_bld->inputs[soa_index] =
-				build_intrinsic(gallivm->builder, intr_name,
+				lp_build_intrinsic(gallivm->builder, intr_name,
 						input_type, args, args[3] ? 4 : 3,
 						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 		}
@@ -587,10 +1028,35 @@ static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resou
 {
 	LLVMValueRef args[2] = {resource, offset};
 
-	return build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
+	return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
 			       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 }
 
+static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
+{ /* Load the position of sample `sample_id` and return it as a 4-component vector (x, y, 0, 0). */
+	struct si_shader_context *si_shader_ctx =
+		si_shader_context(&radeon_bld->soa.bld_base);
+	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
+	struct gallivm_state *gallivm = &radeon_bld->gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST); /* constant-buffer descriptor list */
+	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF); /* slot of the driver-state buffer */
+	LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
+
+	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
+	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
+	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), ""); /* y follows x by 4 bytes */
+
+	LLVMValueRef pos[4] = {
+		buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type), /* x */
+		buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type), /* y */
+		lp_build_const_float(gallivm, 0), /* z is fixed at 0 */
+		lp_build_const_float(gallivm, 0) /* w is fixed at 0 */
+	};
+
+	return lp_build_gather_values(gallivm, pos, 4);
+}
+
 static void declare_system_value(
 	struct radeon_llvm_context * radeon_bld,
 	unsigned index,
@@ -598,6 +1064,7 @@ static void declare_system_value(
 {
 	struct si_shader_context *si_shader_ctx =
 		si_shader_context(&radeon_bld->soa.bld_base);
+	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
 	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
 	struct gallivm_state *gallivm = &radeon_bld->gallivm;
 	LLVMValueRef value = 0;
@@ -626,30 +1093,23 @@ static void declare_system_value(
 				     SI_PARAM_BASE_VERTEX);
 		break;
 
+	case TGSI_SEMANTIC_INVOCATIONID:
+		if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
+			value = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
+		else if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY)
+			value = LLVMGetParam(radeon_bld->main_fn,
+					     SI_PARAM_GS_INSTANCE_ID);
+		else
+			assert(!"INVOCATIONID not implemented");
+		break;
+
 	case TGSI_SEMANTIC_SAMPLEID:
 		value = get_sample_id(radeon_bld);
 		break;
 
 	case TGSI_SEMANTIC_SAMPLEPOS:
-	{
-		LLVMBuilderRef builder = gallivm->builder;
-		LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
-		LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
-		LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
-
-		/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
-		LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, get_sample_id(radeon_bld), 8);
-		LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
-
-		LLVMValueRef pos[4] = {
-			buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type),
-			buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type),
-			lp_build_const_float(gallivm, 0),
-			lp_build_const_float(gallivm, 0)
-		};
-		value = lp_build_gather_values(gallivm, pos, 4);
+		value = load_sample_position(radeon_bld, get_sample_id(radeon_bld));
 		break;
-	}
 
 	case TGSI_SEMANTIC_SAMPLEMASK:
 		/* Smoothing isn't MSAA in GL, but it's MSAA in hardware.
@@ -660,6 +1120,48 @@ static void declare_system_value(
 			value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
 		break;
 
+	case TGSI_SEMANTIC_TESSCOORD:
+	{
+		LLVMValueRef coord[4] = {
+			LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_u),
+			LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_v),
+			bld->zero,
+			bld->zero
+		};
+
+		/* For triangles, the vector should be (u, v, 1-u-v). */
+		if (si_shader_ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
+		    PIPE_PRIM_TRIANGLES)
+			coord[2] = lp_build_sub(bld, bld->one,
+						lp_build_add(bld, coord[0], coord[1]));
+
+		value = lp_build_gather_values(gallivm, coord, 4);
+		break;
+	}
+
+	case TGSI_SEMANTIC_VERTICESIN:
+		value = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
+		break;
+
+	case TGSI_SEMANTIC_TESSINNER:
+	case TGSI_SEMANTIC_TESSOUTER:
+	{
+		LLVMValueRef dw_addr;
+		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
+
+		dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+		dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
+				       lp_build_const_int32(gallivm, param * 4), "");
+
+		value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
+				 ~0, dw_addr);
+		break;
+	}
+
+	case TGSI_SEMANTIC_PRIMID:
+		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
+		break;
+
 	default:
 		assert(!"unknown system value");
 		return;
@@ -679,7 +1181,7 @@ static LLVMValueRef fetch_constant(
 	const struct tgsi_ind_register *ireg = &reg->Indirect;
 	unsigned buf, idx;
 
-	LLVMValueRef addr;
+	LLVMValueRef addr, bufp;
 	LLVMValueRef result;
 
 	if (swizzle == LP_CHAN_ALL) {
@@ -694,8 +1196,24 @@ static LLVMValueRef fetch_constant(
 	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
 	idx = reg->Register.Index * 4 + swizzle;
 
-	if (!reg->Register.Indirect)
-		return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
+	if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
+		if (type != TGSI_TYPE_DOUBLE)
+			return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
+		else {
+			return radeon_llvm_emit_fetch_double(bld_base,
+							     si_shader_ctx->constants[buf][idx],
+							     si_shader_ctx->constants[buf][idx + 1]);
+		}
+	}
+
+	if (reg->Register.Dimension && reg->Dimension.Indirect) {
+		LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
+		LLVMValueRef index;
+		index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
+						   reg->Dimension.Index);
+		bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
+	} else
+		bufp = si_shader_ctx->const_resource[buf];
 
 	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
@@ -703,10 +1221,26 @@ static LLVMValueRef fetch_constant(
 	addr = lp_build_add(&bld_base->uint_bld, addr,
 			    lp_build_const_int32(base->gallivm, idx * 4));
 
-	result = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
-			    addr, base->elem_type);
+	result = buffer_load_const(base->gallivm->builder, bufp,
+				   addr, bld_base->base.elem_type);
 
-	return bitcast(bld_base, type, result);
+	if (type != TGSI_TYPE_DOUBLE)
+		result = bitcast(bld_base, type, result);
+	else {
+		LLVMValueRef addr2, result2;
+		addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
+		addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
+		addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
+		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
+				     lp_build_const_int32(base->gallivm, idx * 4));
+
+		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
+				   addr2, bld_base->base.elem_type);
+
+		result = radeon_llvm_emit_fetch_double(bld_base,
+					               result, result2);
+	}
+	return result;
 }
 
 /* Initialize arguments for the shader export intrinsic */
@@ -745,7 +1279,7 @@ static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
 			args[0] = values[2 * chan];
 			args[1] = values[2 * chan + 1];
 			args[chan + 5] =
-				build_intrinsic(base->gallivm->builder,
+				lp_build_intrinsic(base->gallivm->builder,
 						"llvm.SI.packf16",
 						LLVMInt32TypeInContext(base->gallivm->context),
 						args, 2,
@@ -827,12 +1361,12 @@ static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
 					lp_build_const_float(gallivm, 1.0f),
 					lp_build_const_float(gallivm, -1.0f));
 
-		build_intrinsic(gallivm->builder,
+		lp_build_intrinsic(gallivm->builder,
 				"llvm.AMDGPU.kill",
 				LLVMVoidTypeInContext(gallivm->context),
 				&arg, 1, 0);
 	} else {
-		build_intrinsic(gallivm->builder,
+		lp_build_intrinsic(gallivm->builder,
 				"llvm.AMDGPU.kilp",
 				LLVMVoidTypeInContext(gallivm->context),
 				NULL, 0, 0);
@@ -853,7 +1387,7 @@ static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base
 				SI_PARAM_SAMPLE_COVERAGE);
 	coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
 
-	coverage = build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
+	coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
 				   bld_base->int_bld.elem_type,
 				   &coverage, 1, LLVMReadNoneAttribute);
 
@@ -983,16 +1517,16 @@ static void build_tbuffer_store(struct si_shader_context *shader,
 
 	lp_build_intrinsic(gallivm->builder, name,
 			   LLVMVoidTypeInContext(gallivm->context),
-			   args, Elements(args));
+			   args, Elements(args), 0);
 }
 
-static void build_streamout_store(struct si_shader_context *shader,
-				  LLVMValueRef rsrc,
-				  LLVMValueRef vdata,
-				  unsigned num_channels,
-				  LLVMValueRef vaddr,
-				  LLVMValueRef soffset,
-				  unsigned inst_offset)
+static void build_tbuffer_store_dwords(struct si_shader_context *shader,
+				     LLVMValueRef rsrc,
+				     LLVMValueRef vdata,
+				     unsigned num_channels,
+				     LLVMValueRef vaddr,
+				     LLVMValueRef soffset,
+				     unsigned inst_offset)
 {
 	static unsigned dfmt[] = {
 		V_008F0C_BUF_DATA_FORMAT_32,
@@ -1025,13 +1559,16 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 	LLVMValueRef so_vtx_count =
 		unpack_param(shader, shader->param_streamout_config, 16, 7);
 
-	LLVMValueRef tid = build_intrinsic(builder, "llvm.SI.tid", i32,
+	LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", i32,
 					   NULL, 0, LLVMReadNoneAttribute);
 
 	/* can_emit = tid < so_vtx_count; */
 	LLVMValueRef can_emit =
 		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
 
+	LLVMValueRef stream_id =
+		unpack_param(shader, shader->param_streamout_config, 24, 2);
+
 	/* Emit the streamout code conditionally. This actually avoids
 	 * out-of-bounds buffer access. The hw tells us via the SGPR
 	 * (so_vtx_count) which threads are allowed to emit streamout data. */
@@ -1071,7 +1608,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 			unsigned reg = so->output[i].register_index;
 			unsigned start = so->output[i].start_component;
 			unsigned num_comps = so->output[i].num_components;
+			unsigned stream = so->output[i].stream;
 			LLVMValueRef out[4];
+			struct lp_build_if_state if_ctx_stream;
 
 			assert(num_comps && num_comps <= 4);
 			if (!num_comps || num_comps > 4)
@@ -1105,11 +1644,18 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
 				break;
 			}
 
-			build_streamout_store(shader, shader->so_buffers[buf_idx],
-					      vdata, num_comps,
-					      so_write_offset[buf_idx],
-					      LLVMConstInt(i32, 0, 0),
-					      so->output[i].dst_offset*4);
+			LLVMValueRef can_emit_stream =
+				LLVMBuildICmp(builder, LLVMIntEQ,
+					      stream_id,
+					      lp_build_const_int32(gallivm, stream), "");
+
+			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
+			build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx],
+						   vdata, num_comps,
+						   so_write_offset[buf_idx],
+						   LLVMConstInt(i32, 0, 0),
+						   so->output[i].dst_offset*4);
+			lp_build_endif(&if_ctx_stream);
 		}
 	}
 	lp_build_endif(&if_ctx);
@@ -1128,7 +1674,7 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
 				&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
 	LLVMValueRef args[9];
 	LLVMValueRef pos_args[4][9] = { { 0 } };
-	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL;
+	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
 	unsigned semantic_name, semantic_index;
 	unsigned target;
 	unsigned param_count = 0;
@@ -1154,7 +1700,12 @@ handle_semantic:
 			continue;
 		case TGSI_SEMANTIC_LAYER:
 			layer_value = outputs[i].values[0];
-			continue;
+			semantic_name = TGSI_SEMANTIC_GENERIC;
+			goto handle_semantic;
+		case TGSI_SEMANTIC_VIEWPORT_INDEX:
+			viewport_index_value = outputs[i].values[0];
+			semantic_name = TGSI_SEMANTIC_GENERIC;
+			goto handle_semantic;
 		case TGSI_SEMANTIC_POSITION:
 			target = V_008DFC_SQ_EXP_POS;
 			break;
@@ -1195,7 +1746,7 @@ handle_semantic:
 			lp_build_intrinsic(base->gallivm->builder,
 					   "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9);
+					   args, 9, 0);
 		}
 
 		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
@@ -1204,6 +1755,8 @@ handle_semantic:
 		}
 	}
 
+	shader->nr_param_exports = param_count;
+
 	/* We need to add the position output manually if it's missing. */
 	if (!pos_args[0][0]) {
 		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
@@ -1220,11 +1773,13 @@ handle_semantic:
 	/* Write the misc vector (point size, edgeflag, layer, viewport). */
 	if (shader->selector->info.writes_psize ||
 	    shader->selector->info.writes_edgeflag ||
+	    shader->selector->info.writes_viewport_index ||
 	    shader->selector->info.writes_layer) {
 		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
 						      shader->selector->info.writes_psize |
 						      (shader->selector->info.writes_edgeflag << 1) |
-						      (shader->selector->info.writes_layer << 2));
+						      (shader->selector->info.writes_layer << 2) |
+						      (shader->selector->info.writes_viewport_index << 3));
 		pos_args[1][1] = uint->zero; /* EXEC mask */
 		pos_args[1][2] = uint->zero; /* last export? */
 		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
@@ -1255,6 +1810,9 @@ handle_semantic:
 
 		if (shader->selector->info.writes_layer)
 			pos_args[1][7] = layer_value;
+
+		if (shader->selector->info.writes_viewport_index)
+			pos_args[1][8] = viewport_index_value;
 	}
 
 	for (i = 0; i < 4; i++)
@@ -1276,7 +1834,133 @@ handle_semantic:
 		lp_build_intrinsic(base->gallivm->builder,
 				   "llvm.SI.export",
 				   LLVMVoidTypeInContext(base->gallivm->context),
-				   pos_args[i], 9);
+				   pos_args[i], 9, 0);
+	}
+}
+
+/* TCS epilogue. This only writes the tessellation factor levels: they are loaded from LDS and stored to the SI_RING_TESS_FACTOR buffer. */
+static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	struct si_shader *shader = si_shader_ctx->shader;
+	unsigned tess_inner_index, tess_outer_index;
+	LLVMValueRef lds_base, lds_inner, lds_outer;
+	LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers;
+	LLVMValueRef out[6], vec0, vec1, invocation_id; /* out[]: up to 4 outer + 2 inner factors (quads) */
+	unsigned stride, outer_comps, inner_comps, i;
+	struct lp_build_if_state if_ctx;
+
+	invocation_id = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
+
+	/* Do this only for invocation 0, because the tess levels are per-patch,
+	 * not per-vertex.
+	 *
+	 * This can't jump, because invocation 0 executes this. It should
+	 * at least mask out the loads and stores for other invocations.
+	 */
+	lp_build_if(&if_ctx, gallivm,
+		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
+				  invocation_id, bld_base->uint_bld.zero, ""));
+
+	/* Determine the layout of one tess factor element in the buffer (stride is in dwords). */
+	switch (shader->key.tcs.prim_mode) {
+	case PIPE_PRIM_LINES:
+		stride = 2; /* 2 dwords, 1 vec2 store */
+		outer_comps = 2;
+		inner_comps = 0;
+		break;
+	case PIPE_PRIM_TRIANGLES:
+		stride = 4; /* 4 dwords, 1 vec4 store */
+		outer_comps = 3;
+		inner_comps = 1;
+		break;
+	case PIPE_PRIM_QUADS:
+		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
+		outer_comps = 4;
+		inner_comps = 2;
+		break;
+	default:
+		assert(0);
+		return; /* NOTE(review): returns with the lp_build_if above still open (no lp_build_endif on this path) */
+	}
+
+	/* Load tess_inner and tess_outer from LDS.
+	 * Any invocation can write them, so we can't get them from a temporary.
+	 */
+	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
+	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
+
+	lds_base = get_tcs_out_current_patch_data_offset(si_shader_ctx);
+	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
+				 lp_build_const_int32(gallivm,
+						      tess_inner_index * 4), "");
+	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
+				 lp_build_const_int32(gallivm,
+						      tess_outer_index * 4), "");
+
+	for (i = 0; i < outer_comps; i++)
+		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); /* outer factors fill out[0 .. outer_comps-1] */
+	for (i = 0; i < inner_comps; i++)
+		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); /* inner factors follow the outer ones */
+
+	/* Convert the outputs to vectors for stores: vec0 takes the first MIN2(stride, 4) values, vec1 the rest (only when stride > 4, i.e. quads). */
+	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
+	vec1 = NULL;
+
+	if (stride > 4)
+		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
+
+	/* Get the buffer: entry SI_RING_TESS_FACTOR of the SI_PARAM_RW_BUFFERS list. */
+	rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+				  SI_PARAM_RW_BUFFERS);
+	buffer = build_indexed_load_const(si_shader_ctx, rw_buffers,
+			lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR));
+
+	/* Get the offset: byteoffset = rel_patch_id * (4 * stride); tf_base is passed separately as the store's soffset argument. */
+	tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+			       SI_PARAM_TESS_FACTOR_OFFSET);
+	rel_patch_id = get_rel_patch_id(si_shader_ctx);
+	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
+				  lp_build_const_int32(gallivm, 4 * stride), "");
+
+	/* Store the outputs. vec1 uses inst_offset 16, presumably the byte size of the 4 dwords already written by vec0. */
+	build_tbuffer_store_dwords(si_shader_ctx, buffer, vec0,
+				   MIN2(stride, 4), byteoffset, tf_base, 0);
+	if (vec1)
+		build_tbuffer_store_dwords(si_shader_ctx, buffer, vec1,
+					   stride - 4, byteoffset, tf_base, 16);
+	lp_build_endif(&if_ctx);
+}
+
+static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct tgsi_shader_info *info = &shader->selector->info;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	unsigned i, chan;
+	LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
+					      si_shader_ctx->param_rel_auto_id);
+	LLVMValueRef vertex_dw_stride =
+		unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
+	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
+						 vertex_dw_stride, ""); /* start of this vertex: vertex_id * vertex_dw_stride */
+
+	/* Write all 4 channels of every output to LDS at base_dw_addr + param * 4.
+	 * The next shader (TCS aka HS) will read its inputs from it. */
+	for (i = 0; i < info->num_outputs; i++) {
+		LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
+		unsigned name = info->output_semantic_name[i];
+		unsigned index = info->output_semantic_index[i];
+		int param = si_shader_io_get_unique_index(name, index); /* slot for this semantic; scaled by 4 (one unit per channel) below */
+		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
+					lp_build_const_int32(gallivm, param * 4), "");
+
+		for (chan = 0; chan < 4; chan++) {
+			lds_store(bld_base, chan, dw_addr,
+				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], "")); /* load the current value of the output channel, then store it */
+		}
 	}
 }
 
@@ -1288,17 +1972,25 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct tgsi_shader_info *info = &es->selector->info;
 	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
-					    SI_PARAM_ES2GS_OFFSET);
+					    si_shader_ctx->param_es2gs_offset);
+	uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
+					   es->key.tes.es_enabled_outputs :
+					   es->key.vs.es_enabled_outputs;
 	unsigned chan;
 	int i;
 
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr =
 			si_shader_ctx->radeon_bld.soa.outputs[i];
-		int param_index = get_param_index(info->output_semantic_name[i],
-						  info->output_semantic_index[i],
-						  es->key.vs.gs_used_inputs);
+		int param_index;
 
+		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
+		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
+			continue;
+
+		param_index = get_param_index(info->output_semantic_name[i],
+					      info->output_semantic_index[i],
+					      enabled_outputs);
 		if (param_index < 0)
 			continue;
 
@@ -1326,7 +2018,7 @@ static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
 
 	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
-	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
+	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
 			LLVMNoUnwindAttribute);
 }
@@ -1339,7 +2031,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 	struct si_shader_output_values *outputs = NULL;
 	int i,j;
 
-	outputs = MALLOC(info->num_outputs * sizeof(outputs[0]));
+	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
 
 	for (i = 0; i < info->num_outputs; i++) {
 		outputs[i].name = info->output_semantic_name[i];
@@ -1352,7 +2044,19 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
 					      "");
 	}
 
-	si_llvm_export_vs(bld_base, outputs, info->num_outputs);
+	/* Export PrimitiveID when PS needs it. */
+	if (si_vs_exports_prim_id(si_shader_ctx->shader)) {
+		outputs[i].name = TGSI_SEMANTIC_PRIMID;
+		outputs[i].sid = 0;
+		outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
+					       get_primitive_id(bld_base, 0));
+		outputs[i].values[1] = bld_base->base.undef;
+		outputs[i].values[2] = bld_base->base.undef;
+		outputs[i].values[3] = bld_base->base.undef;
+		i++;
+	}
+
+	si_llvm_export_vs(bld_base, outputs, i);
 	FREE(outputs);
 }
 
@@ -1417,7 +2121,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 				lp_build_intrinsic(base->gallivm->builder,
 						   "llvm.SI.export",
 						   LLVMVoidTypeInContext(base->gallivm->context),
-						   last_args, 9);
+						   last_args, 9, 0);
 			}
 
 			/* This instruction will be emitted at the end of the shader. */
@@ -1434,14 +2138,14 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 					lp_build_intrinsic(base->gallivm->builder,
 							   "llvm.SI.export",
 							   LLVMVoidTypeInContext(base->gallivm->context),
-							   args, 9);
+							   args, 9, 0);
 				}
 			}
 		} else {
 			lp_build_intrinsic(base->gallivm->builder,
 					   "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9);
+					   args, 9, 0);
 		}
 	}
 
@@ -1503,7 +2207,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 			lp_build_intrinsic(base->gallivm->builder,
 					   "llvm.SI.export",
 					   LLVMVoidTypeInContext(base->gallivm->context),
-					   args, 9);
+					   args, 9, 0);
 		else
 			memcpy(last_args, args, sizeof(args));
 	}
@@ -1534,7 +2238,7 @@ static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
 	lp_build_intrinsic(base->gallivm->builder,
 			   "llvm.SI.export",
 			   LLVMVoidTypeInContext(base->gallivm->context),
-			   last_args, 9);
+			   last_args, 9, 0);
 }
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
@@ -1563,15 +2267,36 @@ static void tex_fetch_args(
 	const struct tgsi_full_instruction * inst = emit_data->inst;
 	unsigned opcode = inst->Instruction.Opcode;
 	unsigned target = inst->Texture.Texture;
-	LLVMValueRef coords[5];
+	LLVMValueRef coords[5], derivs[6];
 	LLVMValueRef address[16];
 	int ref_pos;
 	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
 	unsigned count = 0;
 	unsigned chan;
-	unsigned sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
-	unsigned sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
+	unsigned sampler_src;
+	unsigned sampler_index;
+	unsigned num_deriv_channels = 0;
 	bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
+	LLVMValueRef res_ptr, samp_ptr;
+
+	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
+	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
+
+	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
+		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
+		LLVMValueRef ind_index;
+
+		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
+
+		res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+		res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index);
+
+		samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
+		samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index);
+	} else {
+		res_ptr = si_shader_ctx->resources[sampler_index];
+		samp_ptr = si_shader_ctx->samplers[sampler_index];
+	}
 
 	if (target == TGSI_TEXTURE_BUFFER) {
 		LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128);
@@ -1580,7 +2305,7 @@ static void tex_fetch_args(
 		LLVMTypeRef v16i8 = LLVMVectorType(i8, 16);
 
 		/* Bitcast and truncate v8i32 to v16i8. */
-		LLVMValueRef res = si_shader_ctx->resources[sampler_index];
+		LLVMValueRef res = res_ptr;
 		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
 		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
 		res = LLVMBuildBitCast(gallivm->builder, res, v16i8, "");
@@ -1649,18 +2374,13 @@ static void tex_fetch_args(
 		}
 	}
 
-	if (target == TGSI_TEXTURE_CUBE ||
-	    target == TGSI_TEXTURE_CUBE_ARRAY ||
-	    target == TGSI_TEXTURE_SHADOWCUBE ||
-	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
-
 	/* Pack user derivatives */
 	if (opcode == TGSI_OPCODE_TXD) {
-		int num_deriv_channels, param;
+		int param, num_src_deriv_channels;
 
 		switch (target) {
 		case TGSI_TEXTURE_3D:
+			num_src_deriv_channels = 3;
 			num_deriv_channels = 3;
 			break;
 		case TGSI_TEXTURE_2D:
@@ -1669,27 +2389,44 @@ static void tex_fetch_args(
 		case TGSI_TEXTURE_SHADOWRECT:
 		case TGSI_TEXTURE_2D_ARRAY:
 		case TGSI_TEXTURE_SHADOW2D_ARRAY:
+			num_src_deriv_channels = 2;
+			num_deriv_channels = 2;
+			break;
 		case TGSI_TEXTURE_CUBE:
 		case TGSI_TEXTURE_SHADOWCUBE:
 		case TGSI_TEXTURE_CUBE_ARRAY:
 		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+			/* Cube derivatives will be converted to 2D. */
+			num_src_deriv_channels = 3;
 			num_deriv_channels = 2;
 			break;
 		case TGSI_TEXTURE_1D:
 		case TGSI_TEXTURE_SHADOW1D:
 		case TGSI_TEXTURE_1D_ARRAY:
 		case TGSI_TEXTURE_SHADOW1D_ARRAY:
+			num_src_deriv_channels = 1;
 			num_deriv_channels = 1;
 			break;
 		default:
 			assert(0); /* no other targets are valid here */
 		}
 
-		for (param = 1; param <= 2; param++)
-			for (chan = 0; chan < num_deriv_channels; chan++)
-				address[count++] = lp_build_emit_fetch(bld_base, inst, param, chan);
+		for (param = 0; param < 2; param++)
+			for (chan = 0; chan < num_src_deriv_channels; chan++)
+				derivs[param * num_src_deriv_channels + chan] =
+					lp_build_emit_fetch(bld_base, inst, param+1, chan);
 	}
 
+	if (target == TGSI_TEXTURE_CUBE ||
+	    target == TGSI_TEXTURE_CUBE_ARRAY ||
+	    target == TGSI_TEXTURE_SHADOWCUBE ||
+	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
+		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
+
+	if (opcode == TGSI_OPCODE_TXD)
+		for (int i = 0; i < num_deriv_channels * 2; i++)
+			address[count++] = derivs[i];
+
 	/* Pack texture coordinates */
 	address[count++] = coords[0];
 	if (num_coords > 1)
@@ -1806,7 +2543,7 @@ static void tex_fetch_args(
 	}
 
 	/* Resource */
-	emit_data->args[1] = si_shader_ctx->resources[sampler_index];
+	emit_data->args[1] = res_ptr;
 
 	if (opcode == TGSI_OPCODE_TXF) {
 		/* add tex offsets */
@@ -1889,7 +2626,7 @@ static void tex_fetch_args(
 			dmask = 1 << gather_comp;
 		}
 
-		emit_data->args[2] = si_shader_ctx->samplers[sampler_index];
+		emit_data->args[2] = samp_ptr;
 		emit_data->args[3] = lp_build_const_int32(gallivm, dmask);
 		emit_data->args[4] = lp_build_const_int32(gallivm, is_rect); /* unorm */
 		emit_data->args[5] = lp_build_const_int32(gallivm, 0); /* r128 */
@@ -1905,7 +2642,7 @@ static void tex_fetch_args(
 			LLVMFloatTypeInContext(gallivm->context),
 			4);
 	} else {
-		emit_data->args[2] = si_shader_ctx->samplers[sampler_index];
+		emit_data->args[2] = samp_ptr;
 		emit_data->args[3] = lp_build_const_int32(gallivm, target);
 		emit_data->arg_count = 4;
 
@@ -1940,7 +2677,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
 				emit_data->inst->Texture.NumOffsets > 0 : false;
 
 	if (target == TGSI_TEXTURE_BUFFER) {
-		emit_data->output[emit_data->chan] = build_intrinsic(
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			base->gallivm->builder,
 			"llvm.SI.vs.load.input", emit_data->dst_type,
 			emit_data->args, emit_data->arg_count,
@@ -1989,7 +2726,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
 			is_shadow ? ".c" : "", infix, has_offset ? ".o" : "",
 			LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
 
-		emit_data->output[emit_data->chan] = build_intrinsic(
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			base->gallivm->builder, intr_name, emit_data->dst_type,
 			emit_data->args, emit_data->arg_count,
 			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
@@ -2036,7 +2773,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
 		sprintf(intr_name, "%s.v%ui32", name,
 			LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
 
-		emit_data->output[emit_data->chan] = build_intrinsic(
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			base->gallivm->builder, intr_name, emit_data->dst_type,
 			emit_data->args, emit_data->arg_count,
 			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
@@ -2050,17 +2787,47 @@ static void txq_fetch_args(
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
 	unsigned target = inst->Texture.Texture;
+	LLVMValueRef res_ptr;
+
+	if (inst->Src[1].Register.Indirect) {
+		const struct tgsi_full_src_register *reg = &inst->Src[1];
+		LLVMValueRef ind_index;
+
+		ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
+
+		res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+		res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr,
+						   ind_index);
+	} else
+		res_ptr = si_shader_ctx->resources[inst->Src[1].Register.Index];
 
 	if (target == TGSI_TEXTURE_BUFFER) {
 		LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 		LLVMTypeRef v8i32 = LLVMVectorType(i32, 8);
 
 		/* Read the size from the buffer descriptor directly. */
-		LLVMValueRef size = si_shader_ctx->resources[inst->Src[1].Register.Index];
-		size = LLVMBuildBitCast(gallivm->builder, size, v8i32, "");
-		size = LLVMBuildExtractElement(gallivm->builder, size,
-					      lp_build_const_int32(gallivm, 6), "");
+		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, v8i32, "");
+		LLVMValueRef size = LLVMBuildExtractElement(builder, res,
+						lp_build_const_int32(gallivm, 6), "");
+
+		if (si_shader_ctx->screen->b.chip_class >= VI) {
+			/* On VI, the descriptor contains the size in bytes,
+			 * but TXQ must return the size in elements.
+			 * The stride is always non-zero for resources using TXQ.
+			 */
+			LLVMValueRef stride =
+				LLVMBuildExtractElement(builder, res,
+							lp_build_const_int32(gallivm, 5), "");
+			stride = LLVMBuildLShr(builder, stride,
+					       lp_build_const_int32(gallivm, 16), "");
+			stride = LLVMBuildAnd(builder, stride,
+					      lp_build_const_int32(gallivm, 0x3FFF), "");
+
+			size = LLVMBuildUDiv(builder, size, stride, "");
+		}
+
 		emit_data->args[0] = size;
 		return;
 	}
@@ -2069,7 +2836,7 @@ static void txq_fetch_args(
 	emit_data->args[0] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
 
 	/* Resource */
-	emit_data->args[1] = si_shader_ctx->resources[inst->Src[1].Register.Index];
+	emit_data->args[1] = res_ptr;
 
 	/* Texture target */
 	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
@@ -2116,6 +2883,35 @@ static void build_txq_intrinsic(const struct lp_build_tgsi_action * action,
 	}
 }
 
+/*
+ * SI implements derivatives using the local data store (LDS).
+ * All writes to the LDS happen in all executing threads at
+ * the same time. TID is the Thread ID for the current
+ * thread and is a value between 0 and 63, representing
+ * the thread's position in the wavefront.
+ *
+ * For the pixel shader, threads are grouped into quads of four pixels.
+ * The TIDs of the pixels of a quad are:
+ *
+ *  +------+------+
+ *  |4n + 0|4n + 1|
+ *  +------+------+
+ *  |4n + 2|4n + 3|
+ *  +------+------+
+ *
+ * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
+ * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
+ * the current pixel's column, and masking with 0xfffffffe yields the TID
+ * of the left pixel of the current pixel's row.
+ *
+ * Adding 1 yields the TID of the pixel to the right of the left pixel, and
+ * adding 2 yields the TID of the pixel below the top pixel.
+ */
+/* masks for thread ID. */
+#define TID_MASK_TOP_LEFT 0xfffffffc
+#define TID_MASK_TOP      0xfffffffd
+#define TID_MASK_LEFT     0xfffffffe
+
 static void si_llvm_emit_ddxy(
 	const struct lp_build_tgsi_action * action,
 	struct lp_build_tgsi_context * bld_base,
@@ -2132,25 +2928,34 @@ static void si_llvm_emit_ddxy(
 	LLVMTypeRef i32;
 	unsigned swizzle[4];
 	unsigned c;
+	int idx;
+	unsigned mask;
 
 	i32 = LLVMInt32TypeInContext(gallivm->context);
 
 	indices[0] = bld_base->uint_bld.zero;
-	indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
+	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
 				     NULL, 0, LLVMReadNoneAttribute);
-	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
+	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
+	if (opcode == TGSI_OPCODE_DDX_FINE)
+		mask = TID_MASK_LEFT;
+	else if (opcode == TGSI_OPCODE_DDY_FINE)
+		mask = TID_MASK_TOP;
+	else
+		mask = TID_MASK_TOP_LEFT;
+
 	indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
-				  lp_build_const_int32(gallivm, 0xfffffffc), "");
-	load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
+				  lp_build_const_int32(gallivm, mask), "");
+	load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
+	/* For DDX we want the next pixel in X; for DDY, the next pixel in Y. */
+	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
 	indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
-				  lp_build_const_int32(gallivm,
-						       opcode == TGSI_OPCODE_DDX ? 1 : 2),
-				  "");
-	load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
+				  lp_build_const_int32(gallivm, idx), "");
+	load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
 				 indices, 2, "");
 
 	for (c = 0; c < 4; ++c) {
@@ -2184,6 +2989,247 @@ static void si_llvm_emit_ddxy(
 	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
 }
 
+/*
+ * Takes an I,J coordinate pair and works out its X and Y derivatives by
+ * exchanging the values with the other pixels of the quad through the LDS.
+ * Returns the 4-vector DDX(I), DDX(J), DDY(I), DDY(J).
+ */
+static LLVMValueRef si_llvm_emit_ddxy_interp(
+	struct lp_build_tgsi_context *bld_base,
+	LLVMValueRef interp_ij)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	struct lp_build_context *base = &bld_base->base;
+	LLVMValueRef indices[2];
+	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
+	LLVMValueRef tl, tr, bl, result[4];
+	LLVMTypeRef i32;
+	unsigned c;
+
+	i32 = LLVMInt32TypeInContext(gallivm->context);
+	/* Each thread stores its own value to lds[tid]. */
+	indices[0] = bld_base->uint_bld.zero;
+	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
+					NULL, 0, LLVMReadNoneAttribute);
+	store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				 indices, 2, "");
+
+	temp = LLVMBuildAnd(gallivm->builder, indices[1],
+			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
+
+	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
+			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");
+
+	indices[1] = temp; /* left pixel of this pixel's row */
+	load_ptr_x = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				  indices, 2, "");
+
+	indices[1] = temp2; /* top pixel of this pixel's column */
+	load_ptr_y = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				  indices, 2, "");
+
+	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
+				  lp_build_const_int32(gallivm, 1), ""); /* pixel right of the left one */
+	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				   indices, 2, "");
+
+	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
+				  lp_build_const_int32(gallivm, 2), ""); /* pixel below the top one */
+	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
+				   indices, 2, "");
+
+	for (c = 0; c < 2; ++c) {
+		LLVMValueRef store_val;
+		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
+		/* Store component c of interp_ij (0 = I, 1 = J) to this thread's slot. */
+		store_val = LLVMBuildExtractElement(gallivm->builder,
+						    interp_ij, c_ll, "");
+		LLVMBuildStore(gallivm->builder,
+			       store_val,
+			       store_ptr);
+
+		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
+		tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
+
+		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
+		tr = LLVMBuildBitCast(gallivm->builder, tr, base->elem_type, "");
+
+		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, ""); /* DDX: right - left */
+
+		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
+		tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
+
+		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
+		bl = LLVMBuildBitCast(gallivm->builder, bl, base->elem_type, "");
+
+		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, ""); /* DDY: bottom - top */
+	}
+
+	return lp_build_gather_values(gallivm, result, 4);
+}
+
+static void interp_fetch_args(
+	struct lp_build_tgsi_context *bld_base,
+	struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	/* Fills args[0..1] for INTERP_OFFSET / INTERP_SAMPLE; sets nothing otherwise. */
+	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
+		/* offset is in second src, first two channels */
+		emit_data->args[0] = lp_build_emit_fetch(bld_base,
+							 emit_data->inst, 1,
+							 0);
+		emit_data->args[1] = lp_build_emit_fetch(bld_base,
+							 emit_data->inst, 1,
+							 1);
+		emit_data->arg_count = 2;
+	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
+		LLVMValueRef sample_position;
+		LLVMValueRef sample_id;
+		LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f); /* NOTE(review): presumably the pixel-centre position - confirm */
+
+		/* fetch sample ID, then fetch its sample position,
+		 * and place into first two channels.
+		 */
+		sample_id = lp_build_emit_fetch(bld_base,
+						emit_data->inst, 1, 0);
+		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
+					     LLVMInt32TypeInContext(gallivm->context),
+					     "");
+		sample_position = load_sample_position(&si_shader_ctx->radeon_bld, sample_id);
+
+		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
+							     sample_position,
+							     lp_build_const_int32(gallivm, 0), "");
+
+		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, ""); /* x - 0.5 */
+		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
+							     sample_position,
+							     lp_build_const_int32(gallivm, 1), "");
+		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, ""); /* y - 0.5 */
+		emit_data->arg_count = 2;
+	}
+}
+
+/*
+ * Emit the TGSI INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET opcodes.
+ * For the latter two, emit_data->args[0..1] hold the x/y offset that is
+ * applied to the I,J pair; all four result channels are written.
+ */
+static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
+				struct lp_build_tgsi_context *bld_base,
+				struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
+	struct si_shader *shader = si_shader_ctx->shader;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMValueRef interp_param;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const char *intr_name;
+	int input_index;
+	int chan, i;
+	LLVMValueRef attr_number;
+	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
+	LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
+	int interp_param_idx;
+	unsigned location;
+
+	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
+	input_index = inst->Src[0].Register.Index;
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
+	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
+		location = TGSI_INTERPOLATE_LOC_CENTER;
+	else
+		location = TGSI_INTERPOLATE_LOC_CENTROID;
+
+	interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index],
+						     location);
+	if (interp_param_idx == -1)
+		return;
+	else if (interp_param_idx)
+		interp_param = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, interp_param_idx);
+	else
+		interp_param = NULL; /* no I,J pair to offset: llvm.SI.fs.constant is used */
+
+	attr_number = lp_build_const_int32(gallivm,
+					   shader->ps_input_param_offset[input_index]);
+
+	if (interp_param &&
+	    (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
+	     inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)) {
+		LLVMValueRef ij_out[2];
+		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
+
+		/*
+		 * take the I then J parameters, and the DDX/Y for it, and
+		 * calculate the IJ inputs for the interpolator.
+		 * temp1 = ddx * offset/sample.x + I;
+		 * interp_param.I = ddy * offset/sample.y + temp1;
+		 * temp1 = ddx * offset/sample.x + J;
+		 * interp_param.J = ddy * offset/sample.y + temp1;
+		 */
+		for (i = 0; i < 2; i++) {
+			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
+			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
+			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
+								      ddxy_out, ix_ll, "");
+			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
+								      ddxy_out, iy_ll, "");
+			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
+									 interp_param, ix_ll, "");
+			LLVMValueRef temp1, temp2;
+
+			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
+						     LLVMFloatTypeInContext(gallivm->context), "");
+
+			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
+			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
+			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
+			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
+
+			ij_out[i] = LLVMBuildBitCast(gallivm->builder, temp2,
+						     LLVMIntTypeInContext(gallivm->context, 32), "");
+		}
+		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
+	}
+
+	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
+	for (chan = 0; chan < 4; chan++) { /* all four channels, so .zw is written too */
+		LLVMValueRef args[4];
+		LLVMValueRef llvm_chan;
+		unsigned schan;
+
+		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
+		llvm_chan = lp_build_const_int32(gallivm, schan);
+
+		args[0] = llvm_chan;
+		args[1] = attr_number;
+		args[2] = params;
+		args[3] = interp_param;
+		emit_data->output[chan] =
+			lp_build_intrinsic(gallivm->builder, intr_name,
+					   input_type, args, args[3] ? 4 : 3,
+					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+	}
+}
+
+static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
+				       struct lp_build_emit_data *emit_data)
+{
+	LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
+	struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
+	unsigned stream;
+	/* Returns the stream index held in the immediate operand Src[0].x. */
+	assert(src0.File == TGSI_FILE_IMMEDIATE);
+
+	stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3; /* keep the low 2 bits: 0..3 */
+	return stream;
+}
+
 /* Emit one vertex from the geometry shader */
 static void si_llvm_emit_vertex(
 	const struct lp_build_tgsi_action *action,
@@ -2203,9 +3249,14 @@ static void si_llvm_emit_vertex(
 	LLVMValueRef args[2];
 	unsigned chan;
 	int i;
+	unsigned stream;
+
+	stream = si_llvm_get_stream(bld_base, emit_data);
 
 	/* Write vertex attribute values to GSVS ring */
-	gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, "");
+	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
+				       si_shader_ctx->gs_next_vertex[stream],
+				       "");
 
 	/* If this thread has already emitted the declared maximum number of
 	 * vertices, kill it: excessive vertex emissions are not supposed to
@@ -2218,8 +3269,9 @@ static void si_llvm_emit_vertex(
 	kill = lp_build_select(&bld_base->base, can_emit,
 			       lp_build_const_float(gallivm, 1.0f),
 			       lp_build_const_float(gallivm, -1.0f));
-	build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
-			LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
+
+	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
+			   LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
 
 	for (i = 0; i < info->num_outputs; i++) {
 		LLVMValueRef *out_ptr =
@@ -2237,7 +3289,7 @@ static void si_llvm_emit_vertex(
 			out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
 
 			build_tbuffer_store(si_shader_ctx,
-					    si_shader_ctx->gsvs_ring,
+					    si_shader_ctx->gsvs_ring[stream],
 					    out_val, 1,
 					    voffset, soffset, 0,
 					    V_008F0C_BUF_DATA_FORMAT_32,
@@ -2247,12 +3299,13 @@ static void si_llvm_emit_vertex(
 	}
 	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
 				      lp_build_const_int32(gallivm, 1));
-	LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex);
+
+	LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]);
 
 	/* Signal vertex emission */
-	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS);
+	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
-	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
+	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
 			LLVMNoUnwindAttribute);
 }
@@ -2266,15 +3319,28 @@ static void si_llvm_emit_primitive(
 	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMValueRef args[2];
+	unsigned stream;
 
 	/* Signal primitive cut */
-	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_CUT | SENDMSG_GS);
+	stream = si_llvm_get_stream(bld_base, emit_data);
+	args[0] = lp_build_const_int32(gallivm,	SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
 	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
-	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
+	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
 			LLVMVoidTypeInContext(gallivm->context), args, 2,
 			LLVMNoUnwindAttribute);
 }
 
+static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
+				 struct lp_build_tgsi_context *bld_base,
+				 struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	/* Emit a call to llvm.AMDGPU.barrier.local: no arguments, void result. */
+	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local",
+			LLVMVoidTypeInContext(gallivm->context), NULL, 0,
+			LLVMNoUnwindAttribute);
+}
+
 static const struct lp_build_tgsi_action tex_action = {
 	.fetch_args = tex_fetch_args,
 	.emit = build_tex_intrinsic,
@@ -2286,6 +3352,11 @@ static const struct lp_build_tgsi_action txq_action = {
 	.intr_name = "llvm.SI.resinfo"
 };
 
+static const struct lp_build_tgsi_action interp_action = {
+	.fetch_args = interp_fetch_args,	/* fills emit_data->args[0..1] */
+	.emit = build_interp_intrinsic,	/* writes emit_data->output[] */
+};
+
 static void create_meta_data(struct si_shader_context *si_shader_ctx)
 {
 	struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
@@ -2304,6 +3375,27 @@ static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
 			       CONST_ADDR_SPACE);
 }
 
+static void declare_streamout_params(struct si_shader_context *si_shader_ctx,
+				     struct pipe_stream_output_info *so,
+				     LLVMTypeRef *params, LLVMTypeRef i32,
+				     unsigned *num_params)
+{
+	int i;
+	/* Appends i32 entries to params[], advancing *num_params for each one. */
+	/* Streamout SGPRs. */
+	if (so->num_outputs) {
+		params[si_shader_ctx->param_streamout_config = (*num_params)++] = i32;
+		params[si_shader_ctx->param_streamout_write_index = (*num_params)++] = i32;
+	}
+	/* A streamout buffer offset is loaded if the stride is non-zero. */
+	for (i = 0; i < 4; i++) {
+		if (!so->stride[i])
+			continue;
+		/* Record the parameter index assigned to buffer i's offset. */
+		params[si_shader_ctx->param_streamout_offset[i] = (*num_params)++] = i32;
+	}
+}
+
 static void create_function(struct si_shader_context *si_shader_ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
@@ -2336,8 +3428,10 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 		num_params = SI_PARAM_START_INSTANCE+1;
 
 		if (shader->key.vs.as_es) {
-			params[SI_PARAM_ES2GS_OFFSET] = i32;
-			num_params++;
+			params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
+		} else if (shader->key.vs.as_ls) {
+			params[SI_PARAM_LS_OUT_LAYOUT] = i32;
+			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
 		} else {
 			if (shader->is_gs_copy_shader) {
 				last_array_pointer = SI_PARAM_CONST;
@@ -2345,30 +3439,52 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 			}
 
 			/* The locations of the other parameters are assigned dynamically. */
-
-			/* Streamout SGPRs. */
-			if (shader->selector->so.num_outputs) {
-				params[si_shader_ctx->param_streamout_config = num_params++] = i32;
-				params[si_shader_ctx->param_streamout_write_index = num_params++] = i32;
-			}
-			/* A streamout buffer offset is loaded if the stride is non-zero. */
-			for (i = 0; i < 4; i++) {
-				if (!shader->selector->so.stride[i])
-					continue;
-
-				params[si_shader_ctx->param_streamout_offset[i] = num_params++] = i32;
-			}
+			declare_streamout_params(si_shader_ctx, &shader->selector->so,
+						 params, i32, &num_params);
 		}
 
 		last_sgpr = num_params-1;
 
 		/* VGPRs */
 		params[si_shader_ctx->param_vertex_id = num_params++] = i32;
-		params[num_params++] = i32; /* unused*/
-		params[num_params++] = i32; /* unused */
+		params[si_shader_ctx->param_rel_auto_id = num_params++] = i32;
+		params[si_shader_ctx->param_vs_prim_id = num_params++] = i32;
 		params[si_shader_ctx->param_instance_id = num_params++] = i32;
 		break;
 
+	case TGSI_PROCESSOR_TESS_CTRL:
+		params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
+		params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
+		params[SI_PARAM_TCS_IN_LAYOUT] = i32;
+		params[SI_PARAM_TESS_FACTOR_OFFSET] = i32;
+		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
+
+		/* VGPRs */
+		params[SI_PARAM_PATCH_ID] = i32;
+		params[SI_PARAM_REL_IDS] = i32;
+		num_params = SI_PARAM_REL_IDS+1;
+		break;
+
+	case TGSI_PROCESSOR_TESS_EVAL:
+		params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
+		params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
+		num_params = SI_PARAM_TCS_OUT_LAYOUT+1;
+
+		if (shader->key.tes.as_es) {
+			params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
+		} else {
+			declare_streamout_params(si_shader_ctx, &shader->selector->so,
+						 params, i32, &num_params);
+		}
+		last_sgpr = num_params - 1;
+
+		/* VGPRs */
+		params[si_shader_ctx->param_tes_u = num_params++] = f32;
+		params[si_shader_ctx->param_tes_v = num_params++] = f32;
+		params[si_shader_ctx->param_tes_rel_patch_id = num_params++] = i32;
+		params[si_shader_ctx->param_tes_patch_id = num_params++] = i32;
+		break;
+
 	case TGSI_PROCESSOR_GEOMETRY:
 		params[SI_PARAM_GS2VS_OFFSET] = i32;
 		params[SI_PARAM_GS_WAVE_ID] = i32;
@@ -2435,12 +3551,35 @@ static void create_function(struct si_shader_context *si_shader_ctx)
 
 	if (bld_base->info &&
 	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
-	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0))
-		si_shader_ctx->ddxy_lds =
+	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
+	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
+		si_shader_ctx->lds =
 			LLVMAddGlobalInAddressSpace(gallivm->module,
 						    LLVMArrayType(i32, 64),
 						    "ddxy_lds",
 						    LOCAL_ADDR_SPACE);
+
+	if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
+	    si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
+	    si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
+		/* This is the upper bound, maximum is 32 inputs times 32 vertices */
+		unsigned vertex_data_dw_size = 32*32*4;
+		unsigned patch_data_dw_size = 32*4;
+		/* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
+		unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
+		unsigned lds_dwords = patch_dw_size;
+
+		/* The actual size is computed outside of the shader to reduce
+		 * the number of shader variants. */
+		si_shader_ctx->lds =
+			LLVMAddGlobalInAddressSpace(gallivm->module,
+						    LLVMArrayType(i32, lds_dwords),
+						    "tess_lds",
+						    LOCAL_ADDR_SPACE);
+	}
 }
 
 static void preload_constants(struct si_shader_context *si_shader_ctx)
@@ -2517,9 +3656,13 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	unsigned i;
 
-	if (si_shader_ctx->type != TGSI_PROCESSOR_VERTEX ||
-	    si_shader_ctx->shader->key.vs.as_es ||
-	    !si_shader_ctx->shader->selector->so.num_outputs)
+	/* Streamout can only be used if the shader is compiled as VS. */
+	if (!si_shader_ctx->shader->selector->so.num_outputs ||
+	    (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
+	     (si_shader_ctx->shader->key.vs.as_es ||
+	      si_shader_ctx->shader->key.vs.as_ls)) ||
+	    (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
+	     si_shader_ctx->shader->key.tes.as_es))
 		return;
 
 	LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
@@ -2550,6 +3693,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 
 	if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
 	     si_shader_ctx->shader->key.vs.as_es) ||
+	    (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
+	     si_shader_ctx->shader->key.tes.as_es) ||
 	    si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS);
 
@@ -2557,13 +3702,21 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 			build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
 	}
 
-	if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY ||
-	    si_shader_ctx->shader->is_gs_copy_shader) {
+	if (si_shader_ctx->shader->is_gs_copy_shader) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
 
-		si_shader_ctx->gsvs_ring =
+		si_shader_ctx->gsvs_ring[0] =
 			build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
 	}
+	if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+		int i;
+		for (i = 0; i < 4; i++) {
+			LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i);
+
+			si_shader_ctx->gsvs_ring[i] =
+				build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
+		}
+	}
 }
 
 void si_shader_binary_read_config(const struct si_screen *sscreen,
@@ -2637,26 +3790,54 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 	}
 }
 
-int si_shader_binary_read(struct si_screen *sscreen,
-			struct si_shader *shader,
-			const struct radeon_shader_binary *binary)
+int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 {
-
-	unsigned i;
-	unsigned code_size;
+	const struct radeon_shader_binary *binary = &shader->binary;
+	unsigned code_size = binary->code_size + binary->rodata_size;
 	unsigned char *ptr;
+	/* Copies code, then rodata, into a new shader->bo. Returns 0 or -ENOMEM. */
+	r600_resource_reference(&shader->bo, NULL);
+	shader->bo = si_resource_create_custom(&sscreen->b.b,
+					       PIPE_USAGE_IMMUTABLE,
+					       code_size);
+	if (!shader->bo)
+		return -ENOMEM;
+
+	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
+					PIPE_TRANSFER_READ_WRITE); /* NOTE(review): result is used without a NULL check */
+	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
+	if (binary->rodata_size > 0) {
+		ptr += binary->code_size; /* rodata is placed right after the code */
+		util_memcpy_cpu_to_le32(ptr, binary->rodata,
+					binary->rodata_size);
+	}
+
+	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
+	return 0;
+}
+
+/* Read the config, upload the code and optionally dump it to stderr. */
+int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
+{
+	const struct radeon_shader_binary *binary = &shader->binary;
+	unsigned i;
 	bool dump  = r600_can_dump_shader(&sscreen->b,
 		shader->selector ? shader->selector->tokens : NULL);
 
 	si_shader_binary_read_config(sscreen, shader, 0);
+	if (si_shader_binary_upload(sscreen, shader) != 0)
+		return -ENOMEM; /* the only failure the upload reports */
 
 	if (dump) {
-		if (!binary->disassembled) {
-			fprintf(stderr, "SI CODE:\n");
-			for (i = 0; i < binary->code_size; i+=4 ) {
-				fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
-				binary->code[i + 2], binary->code[i + 1],
-				binary->code[i]);
+		if (!(sscreen->b.debug_flags & DBG_NO_ASM)) {
+			if (binary->disasm_string) {
+				fprintf(stderr, "\nShader Disassembly:\n\n%s\n", binary->disasm_string);
+			} else {
+				fprintf(stderr, "SI CODE:\n");
+				for (i = 0; i < binary->code_size; i+=4 ) {
+					fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
+					binary->code[i + 2], binary->code[i + 1], binary->code[i]);
+				}
 			}
 		}
 
@@ -2666,26 +3847,6 @@ int si_shader_binary_read(struct si_screen *sscreen,
 			shader->num_sgprs, shader->num_vgprs, binary->code_size,
 			shader->lds_size, shader->scratch_bytes_per_wave);
 	}
-
-	/* copy new shader */
-	code_size = binary->code_size + binary->rodata_size;
-	r600_resource_reference(&shader->bo, NULL);
-	shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE,
-					       code_size);
-	if (shader->bo == NULL) {
-		return -ENOMEM;
-	}
-
-
-	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_READ_WRITE);
-	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
-	if (binary->rodata_size > 0) {
-		ptr += binary->code_size;
-		util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
-	}
-
-	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
-
 	return 0;
 }
 
@@ -2693,15 +3854,16 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod)
 {
 	int r = 0;
-	bool dump = r600_can_dump_shader(&sscreen->b,
-			shader->selector ? shader->selector->tokens : NULL);
-	r = radeon_llvm_compile(mod, &shader->binary,
-		r600_get_llvm_processor_name(sscreen->b.family), dump, tm);
+	bool dump_asm = r600_can_dump_shader(&sscreen->b,
+				shader->selector ? shader->selector->tokens : NULL);
+	bool dump_ir = dump_asm && !(sscreen->b.debug_flags & DBG_NO_IR);
 
-	if (r) {
+	r = radeon_llvm_compile(mod, &shader->binary,
+		r600_get_llvm_processor_name(sscreen->b.family), dump_ir, dump_asm, tm);
+	if (r)
 		return r;
-	}
-	r = si_shader_binary_read(sscreen, shader, &shader->binary);
+
+	r = si_shader_binary_read(sscreen, shader);
 
 	FREE(shader->binary.config);
 	FREE(shader->binary.rodata);
@@ -2709,7 +3871,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 	if (shader->scratch_bytes_per_wave == 0) {
 		FREE(shader->binary.code);
 		FREE(shader->binary.relocs);
-		memset(&shader->binary, 0, sizeof(shader->binary));
+		memset(&shader->binary, 0,
+		       offsetof(struct radeon_shader_binary, disasm_string));
 	}
 	return r;
 }
@@ -2741,7 +3904,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 	preload_streamout_buffers(si_shader_ctx);
 	preload_ring_buffers(si_shader_ctx);
 
-	args[0] = si_shader_ctx->gsvs_ring;
+	args[0] = si_shader_ctx->gsvs_ring[0];
 	args[1] = lp_build_mul_imm(uint,
 				   LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 						si_shader_ctx->param_vertex_id),
@@ -2767,7 +3930,7 @@ static int si_generate_gs_copy_shader(struct si_screen *sscreen,
 
 			outputs[i].values[chan] =
 				LLVMBuildBitCast(gallivm->builder,
-						 build_intrinsic(gallivm->builder,
+						 lp_build_intrinsic(gallivm->builder,
 								 "llvm.SI.buffer.load.dword.i32.i32",
 								 LLVMInt32TypeInContext(gallivm->context),
 								 args, 9,
@@ -2807,9 +3970,21 @@ static void si_dump_key(unsigned shader, union si_shader_key *key)
 		fprintf(stderr, "}\n");
 
 		if (key->vs.as_es)
-			fprintf(stderr, "  gs_used_inputs = 0x%"PRIx64"\n",
-				key->vs.gs_used_inputs);
+			fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
+				key->vs.es_enabled_outputs);
 		fprintf(stderr, "  as_es = %u\n", key->vs.as_es);
+		fprintf(stderr, "  as_ls = %u\n", key->vs.as_ls);
+		break;
+
+	case PIPE_SHADER_TESS_CTRL:
+		fprintf(stderr, "  prim_mode = %u\n", key->tcs.prim_mode);
+		break;
+
+	case PIPE_SHADER_TESS_EVAL:
+		if (key->tes.as_es)
+			fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
+				key->tes.es_enabled_outputs);
+		fprintf(stderr, "  as_es = %u\n", key->tes.as_es);
 		break;
 
 	case PIPE_SHADER_GEOMETRY:
@@ -2851,7 +4026,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
 	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
 	 * conversion fails. */
-	if (dump) {
+	if (dump && !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
 		si_dump_key(sel->type, &shader->key);
 		tgsi_dump(tokens, 0);
 		si_dump_streamout(&sel->so);
@@ -2873,6 +4048,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
 
+	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
+	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
+	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
+
 	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
@@ -2888,9 +4067,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
+	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
+	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
 
 	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
 	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
+	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
 
 	if (HAVE_LLVM >= 0x0306) {
 		bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
@@ -2908,11 +4090,25 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	switch (si_shader_ctx.type) {
 	case TGSI_PROCESSOR_VERTEX:
 		si_shader_ctx.radeon_bld.load_input = declare_input_vs;
-		if (shader->key.vs.as_es) {
+		if (shader->key.vs.as_ls)
+			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
+		else if (shader->key.vs.as_es)
 			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
-		} else {
+		else
+			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
+		break;
+	case TGSI_PROCESSOR_TESS_CTRL:
+		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
+		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
+		bld_base->emit_store = store_output_tcs;
+		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
+		break;
+	case TGSI_PROCESSOR_TESS_EVAL:
+		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
+		if (shader->key.tes.as_es)
+			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
+		else
 			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
-		}
 		break;
 	case TGSI_PROCESSOR_GEOMETRY:
 		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
@@ -2946,9 +4142,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	preload_ring_buffers(&si_shader_ctx);
 
 	if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
-		si_shader_ctx.gs_next_vertex =
-			lp_build_alloca(bld_base->base.gallivm,
-					bld_base->uint_bld.elem_type, "");
+		int i;
+		for (i = 0; i < 4; i++) {
+			si_shader_ctx.gs_next_vertex[i] =
+				lp_build_alloca(bld_base->base.gallivm,
+						bld_base->uint_bld.elem_type, "");
+		}
 	}
 
 	if (!lp_build_tgsi_llvm(bld_base, tokens)) {
@@ -3000,4 +4199,5 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
 
 	FREE(shader->binary.code);
 	FREE(shader->binary.relocs);
+	FREE(shader->binary.disasm_string);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 51055afe36a..cd845c12e64 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,6 +26,46 @@
  *      Christian König <christian.koenig@amd.com>
  */
 
+/* How linking tessellation shader inputs and outputs works.
+ *
+ * Inputs and outputs between shaders are stored in a buffer. This buffer
+ * lives in LDS (typical case for tessellation), but it can also live
+ * in memory. Each input or output has a fixed location within a vertex.
+ * The highest used input or output determines the stride between vertices.
+ *
+ * Since tessellation is only enabled in the OpenGL core profile,
+ * only these semantics are valid for per-vertex data:
+ *
+ *   Name             Location
+ *
+ *   POSITION         0
+ *   PSIZE            1
+ *   CLIPDIST0..1     2..3
+ *   CULLDIST0..1     (not implemented)
+ *   GENERIC0..31     4..35
+ *
+ * For example, a shader only writing GENERIC0 has the output stride of 5.
+ *
+ * Only these semantics are valid for per-patch data:
+ *
+ *   Name             Location
+ *
+ *   TESSOUTER        0
+ *   TESSINNER        1
+ *   PATCH0..29       2..31
+ *
+ * That's how independent shaders agree on input and output locations.
+ * The si_shader_io_get_unique_index function assigns the locations.
+ *
+ * Other required information for calculating the input and output addresses
+ * like the vertex stride, the patch stride, and the offsets where per-vertex
+ * and per-patch data start, is passed to the shader via user data SGPRs.
+ * The offsets and strides are calculated at draw time and aren't available
+ * at compile time.
+ *
+ * The same approach should be used for linking ES->GS in the future.
+ */
+
 #ifndef SI_SHADER_H
 #define SI_SHADER_H
 
@@ -43,9 +83,16 @@ struct radeon_shader_reloc;
 #define SI_SGPR_VERTEX_BUFFER	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
+#define SI_SGPR_LS_OUT_LAYOUT	12 /* VS(LS) only */
+#define SI_SGPR_TCS_OUT_OFFSETS	8  /* TCS & TES only */
+#define SI_SGPR_TCS_OUT_LAYOUT	9  /* TCS & TES only */
+#define SI_SGPR_TCS_IN_LAYOUT	10 /* TCS only */
 #define SI_SGPR_ALPHA_REF	8  /* PS only */
 
 #define SI_VS_NUM_USER_SGPR	12
+#define SI_LS_NUM_USER_SGPR	13
+#define SI_TCS_NUM_USER_SGPR	11
+#define SI_TES_NUM_USER_SGPR	10
 #define SI_GS_NUM_USER_SGPR	8
 #define SI_GSCOPY_NUM_USER_SGPR	4
 #define SI_PS_NUM_USER_SGPR	9
@@ -62,8 +109,30 @@ struct radeon_shader_reloc;
 #define SI_PARAM_START_INSTANCE	6
 /* the other VS parameters are assigned dynamically */
 
-/* ES only parameters */
-#define SI_PARAM_ES2GS_OFFSET	7
+/* Offsets where TCS outputs and TCS patch outputs live in LDS:
+ *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
+ *   [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32 + 32*32
+ */
+#define SI_PARAM_TCS_OUT_OFFSETS 4 /* for TCS & TES */
+
+/* Layout of TCS outputs / TES inputs:
+ *   [0:12] = stride between output patches in dwords = num_outputs * num_vertices * 4, max = 32*32*4
+ *   [13:20] = stride between output vertices in dwords = num_outputs * 4, max = 32*4
+ *   [26:31] = gl_PatchVerticesIn, max = 32
+ */
+#define SI_PARAM_TCS_OUT_LAYOUT	5 /* for TCS & TES */
+
+/* Layout of LS outputs / TCS inputs
+ *   [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4
+ *   [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4
+ */
+#define SI_PARAM_TCS_IN_LAYOUT	6 /* TCS only */
+#define SI_PARAM_LS_OUT_LAYOUT	7 /* same value as TCS_IN_LAYOUT, LS only */
+
+/* TCS only parameters. */
+#define SI_PARAM_TESS_FACTOR_OFFSET 7
+#define SI_PARAM_PATCH_ID	8
+#define SI_PARAM_REL_IDS	9
 
 /* GS only parameters */
 #define SI_PARAM_GS2VS_OFFSET	4
@@ -115,9 +184,25 @@ struct si_shader_selector {
 
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
-	uint64_t	gs_used_inputs; /* mask of "get_unique_index" bits */
+	unsigned	gs_num_invocations;
+
+	/* masks of "get_unique_index" bits */
+	uint64_t	inputs_read;
+	uint64_t	outputs_written;
+	uint32_t	patch_outputs_written;
 };
 
+/* Valid shader configurations:
+ *
+ * API shaders       VS | TCS | TES | GS |pass| PS
+ * are compiled as:     |     |     |    |thru|
+ *                      |     |     |    |    |
+ * Only VS & PS:     VS | --  | --  | -- | -- | PS
+ * With GS:          ES | --  | --  | GS | VS | PS
+ * With Tessel.:     LS | HS  | VS  | -- | -- | PS
+ * With both:        LS | HS  | ES  | GS | VS | PS
+ */
+
 union si_shader_key {
 	struct {
 		unsigned	export_16bpc:8;
@@ -130,11 +215,25 @@ union si_shader_key {
 	} ps;
 	struct {
 		unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
-		/* The mask of "get_unique_index" bits, needed for ES,
-		 * it describes how the ES->GS ring buffer is laid out. */
-		uint64_t	gs_used_inputs;
-		unsigned	as_es:1;
+		/* Mask of "get_unique_index" bits - which outputs are read
+		 * by the next stage (needed by ES).
+		 * This describes how outputs are laid out in memory. */
+		uint64_t	es_enabled_outputs;
+		unsigned	as_es:1; /* export shader */
+		unsigned	as_ls:1; /* local shader */
+		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
 	} vs;
+	struct {
+		unsigned	prim_mode:3;
+	} tcs; /* tessellation control shader */
+	struct {
+		/* Mask of "get_unique_index" bits - which outputs are read
+		 * by the next stage (needed by ES).
+		 * This describes how outputs are laid out in memory. */
+		uint64_t	es_enabled_outputs;
+		unsigned	as_es:1; /* export shader */
+		unsigned	export_prim_id; /* when PS needs it and GS is disabled */
+	} tes; /* tessellation evaluation shader */
 };
 
 struct si_shader {
@@ -161,27 +260,47 @@ struct si_shader {
 	unsigned		nparam;
 	unsigned		vs_output_param_offset[PIPE_MAX_SHADER_OUTPUTS];
 	unsigned		ps_input_param_offset[PIPE_MAX_SHADER_INPUTS];
-
+	unsigned		ps_input_interpolate[PIPE_MAX_SHADER_INPUTS];
 	bool			uses_instanceid;
 	unsigned		nr_pos_exports;
+	unsigned		nr_param_exports;
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */
+
+	unsigned		ls_rsrc1;
+	unsigned		ls_rsrc2;
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
 {
-	return sctx->gs_shader ? &sctx->gs_shader->info
-                               : &sctx->vs_shader->info;
+	if (sctx->gs_shader)
+		return &sctx->gs_shader->info;
+	else if (sctx->tes_shader)
+		return &sctx->tes_shader->info;
+	else
+		return &sctx->vs_shader->info;
 }
 
 static inline struct si_shader* si_get_vs_state(struct si_context *sctx)
 {
 	if (sctx->gs_shader)
 		return sctx->gs_shader->current->gs_copy_shader;
+	else if (sctx->tes_shader)
+		return sctx->tes_shader->current;
 	else
 		return sctx->vs_shader->current;
 }
 
+/* Report the key's export_prim_id flag for VS and TES shaders; false otherwise. */
+static inline bool si_vs_exports_prim_id(struct si_shader *shader)
+{
+	switch (shader->selector->type) {
+	case PIPE_SHADER_VERTEX:    return shader->key.vs.export_prim_id;
+	case PIPE_SHADER_TESS_EVAL: return shader->key.tes.export_prim_id;
+	default:                    return false;
+	}
+}
+
 /* radeonsi_shader.c */
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		     struct si_shader *shader);
@@ -189,8 +308,8 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMTargetMachineRef tm, LLVMModuleRef mod);
 void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
-int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
-		const struct radeon_shader_binary *binary);
+int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
+int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			uint64_t scratch_va);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 6c18836d189..c923ea7e154 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -61,7 +61,7 @@ unsigned si_array_mode(unsigned mode)
 
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex)
 {
-	if (sscreen->b.chip_class == CIK &&
+	if (sscreen->b.chip_class >= CIK &&
 	    sscreen->b.info.cik_macrotile_mode_array_valid) {
 		unsigned index, tileb;
 
@@ -489,11 +489,14 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 		S_02881C_USE_VTX_POINT_SIZE(info->writes_psize) |
 		S_02881C_USE_VTX_EDGE_FLAG(info->writes_edgeflag) |
 		S_02881C_USE_VTX_RENDER_TARGET_INDX(info->writes_layer) |
+	        S_02881C_USE_VTX_VIEWPORT_INDX(info->writes_viewport_index) |
 		S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipdist_mask & 0x0F) != 0) |
 		S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipdist_mask & 0xF0) != 0) |
 		S_02881C_VS_OUT_MISC_VEC_ENA(info->writes_psize ||
 					    info->writes_edgeflag ||
-					    info->writes_layer) |
+					    info->writes_layer ||
+					     info->writes_viewport_index) |
+		S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) |
 		(sctx->queued.named.rasterizer->clip_plane_enable &
 		 clipdist_mask));
 	r600_write_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
@@ -509,20 +512,26 @@ static void si_set_scissor_states(struct pipe_context *ctx,
                                   const struct pipe_scissor_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_state_scissor *scissor = CALLOC_STRUCT(si_state_scissor);
-	struct si_pm4_state *pm4 = &scissor->pm4;
+	struct si_state_scissor *scissor;
+	struct si_pm4_state *pm4;
+	int i;
 
-	if (scissor == NULL)
-		return;
+	for (i = start_slot; i < start_slot + num_scissors; i++) {
+		int idx = i - start_slot;
+		int offset = i * 4 * 2;
 
-	scissor->scissor = *state;
-	si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL,
-		       S_028250_TL_X(state->minx) | S_028250_TL_Y(state->miny) |
-		       S_028250_WINDOW_OFFSET_DISABLE(1));
-	si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR,
-		       S_028254_BR_X(state->maxx) | S_028254_BR_Y(state->maxy));
-
-	si_pm4_set_state(sctx, scissor, scissor);
+		scissor = CALLOC_STRUCT(si_state_scissor);
+		if (scissor == NULL)
+			return;
+		pm4 = &scissor->pm4;
+		scissor->scissor = state[idx];
+		si_pm4_set_reg(pm4, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset,
+			       S_028250_TL_X(state[idx].minx) | S_028250_TL_Y(state[idx].miny) |
+			       S_028250_WINDOW_OFFSET_DISABLE(1));
+		si_pm4_set_reg(pm4, R_028254_PA_SC_VPORT_SCISSOR_0_BR + offset,
+			       S_028254_BR_X(state[idx].maxx) | S_028254_BR_Y(state[idx].maxy));
+		si_pm4_set_state(sctx, scissor[i], scissor);
+	}
 }
 
 static void si_set_viewport_states(struct pipe_context *ctx,
@@ -531,21 +540,29 @@ static void si_set_viewport_states(struct pipe_context *ctx,
                                    const struct pipe_viewport_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct si_state_viewport *viewport = CALLOC_STRUCT(si_state_viewport);
-	struct si_pm4_state *pm4 = &viewport->pm4;
+	struct si_state_viewport *viewport;
+	struct si_pm4_state *pm4;
+	int i;
 
-	if (viewport == NULL)
-		return;
+	for (i = start_slot; i < start_slot + num_viewports; i++) {
+		int idx = i - start_slot;
+		int offset = i * 4 * 6;
 
-	viewport->viewport = *state;
-	si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE_0, fui(state->scale[0]));
-	si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET_0, fui(state->translate[0]));
-	si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE_0, fui(state->scale[1]));
-	si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET_0, fui(state->translate[1]));
-	si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE_0, fui(state->scale[2]));
-	si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET_0, fui(state->translate[2]));
+		viewport = CALLOC_STRUCT(si_state_viewport);
+		if (!viewport)
+			return;
+		pm4 = &viewport->pm4;
 
-	si_pm4_set_state(sctx, viewport, viewport);
+		viewport->viewport = state[idx];
+		si_pm4_set_reg(pm4, R_02843C_PA_CL_VPORT_XSCALE + offset, fui(state[idx].scale[0]));
+		si_pm4_set_reg(pm4, R_028440_PA_CL_VPORT_XOFFSET + offset, fui(state[idx].translate[0]));
+		si_pm4_set_reg(pm4, R_028444_PA_CL_VPORT_YSCALE + offset, fui(state[idx].scale[1]));
+		si_pm4_set_reg(pm4, R_028448_PA_CL_VPORT_YOFFSET + offset, fui(state[idx].translate[1]));
+		si_pm4_set_reg(pm4, R_02844C_PA_CL_VPORT_ZSCALE + offset, fui(state[idx].scale[2]));
+		si_pm4_set_reg(pm4, R_028450_PA_CL_VPORT_ZOFFSET + offset, fui(state[idx].translate[2]));
+
+		si_pm4_set_state(sctx, viewport[i], viewport);
+	}
 }
 
 /*
@@ -649,7 +666,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 
 	/* offset */
 	rs->offset_units = state->offset_units;
-	rs->offset_scale = state->offset_scale * 12.0f;
+	rs->offset_scale = state->offset_scale * 16.0f;
 
 	si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0,
 		S_0286D4_FLAT_SHADE_ENA(1) |
@@ -718,12 +735,12 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 
 	if (sctx->framebuffer.nr_samples > 1 &&
 	    (!old_rs || old_rs->multisample_enable != rs->multisample_enable))
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 	si_pm4_bind_state(sctx, rasterizer, rs);
 	si_update_fb_rs_state(sctx);
 
-	sctx->clip_regs.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 }
 
 static void si_delete_rs_state(struct pipe_context *ctx, void *state)
@@ -821,7 +838,8 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
 
 	db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
 		S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
-		S_028800_ZFUNC(state->depth.func);
+		S_028800_ZFUNC(state->depth.func) |
+		S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);
 
 	/* stencil */
 	if (state->stencil[0].enabled) {
@@ -850,9 +868,12 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
 		dsa->alpha_func = PIPE_FUNC_ALWAYS;
 	}
 
-	/* misc */
 	si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
 	si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
+	if (state->depth.bounds_test) {
+		si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
+		si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
+	}
 
 	return dsa;
 }
@@ -888,7 +909,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 
-	sctx->db_render_state.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
 }
 
 static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state)
@@ -1157,7 +1178,9 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 				       int first_non_void)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
-	bool enable_s3tc = sscreen->b.info.drm_minor >= 31;
+	bool enable_compressed_formats = (sscreen->b.info.drm_major == 2 &&
+					  sscreen->b.info.drm_minor >= 31) ||
+					 sscreen->b.info.drm_major == 3;
 	boolean uniform = TRUE;
 	int i;
 
@@ -1200,7 +1223,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
-		if (!enable_s3tc)
+		if (!enable_compressed_formats)
 			goto out_unknown;
 
 		switch (format) {
@@ -1220,7 +1243,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
-		if (!enable_s3tc)
+		if (!enable_compressed_formats)
 			goto out_unknown;
 
 		switch (format) {
@@ -1249,8 +1272,7 @@ static uint32_t si_translate_texformat(struct pipe_screen *screen,
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
-
-		if (!enable_s3tc)
+		if (!enable_compressed_formats)
 			goto out_unknown;
 
 		if (!util_format_s3tc_enabled) {
@@ -1606,7 +1628,6 @@ boolean si_is_format_supported(struct pipe_screen *screen,
                                unsigned sample_count,
                                unsigned usage)
 {
-	struct si_screen *sscreen = (struct si_screen *)screen;
 	unsigned retval = 0;
 
 	if (target >= PIPE_MAX_TEXTURE_TYPES) {
@@ -1618,8 +1639,7 @@ boolean si_is_format_supported(struct pipe_screen *screen,
 		return FALSE;
 
 	if (sample_count > 1) {
-		/* 2D tiling on CIK is supported since DRM 2.35.0 */
-		if (sscreen->b.chip_class >= CIK && sscreen->b.info.drm_minor < 35)
+		if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
 			return FALSE;
 
 		switch (sample_count) {
@@ -1826,6 +1846,9 @@ static void si_initialize_color_surface(struct si_context *sctx,
 	surf->cb_color_info = color_info;
 	surf->cb_color_attrib = color_attrib;
 
+	if (sctx->b.chip_class >= VI)
+		surf->cb_dcc_control = S_028C78_OVERWRITE_COMBINER_DISABLE(1);
+
 	if (rtex->fmask.size) {
 		surf->cb_color_fmask = (offset + rtex->fmask.offset) >> 8;
 		surf->cb_color_fmask_slice = S_028C88_TILE_MAX(rtex->fmask.slice_tile_max);
@@ -2023,7 +2046,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 				  util_format_is_pure_integer(state->cbufs[0]->format);
 
 	if (sctx->framebuffer.cb0_is_integer != old_cb0_is_integer)
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 	for (i = 0; i < state->nr_cbufs; i++) {
 		if (!state->cbufs[i])
@@ -2043,6 +2066,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		if (rtex->fmask.size && rtex->cmask.size) {
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
 		}
+		r600_context_add_resource_size(ctx, surf->base.texture);
 	}
 	/* Set the 16BPC export for possible dual-src blending. */
 	if (i == 1 && surf && surf->export_16bpc) {
@@ -2057,20 +2081,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		if (!surf->depth_initialized) {
 			si_init_depth_surface(sctx, surf);
 		}
+		r600_context_add_resource_size(ctx, surf->base.texture);
 	}
 
 	si_update_fb_rs_state(sctx);
 	si_update_fb_blend_state(sctx);
 
-	sctx->framebuffer.atom.num_dw = state->nr_cbufs*15 + (8 - state->nr_cbufs)*3;
+	sctx->framebuffer.atom.num_dw = state->nr_cbufs*16 + (8 - state->nr_cbufs)*3;
 	sctx->framebuffer.atom.num_dw += state->zsbuf ? 26 : 4;
 	sctx->framebuffer.atom.num_dw += 3; /* WINDOW_SCISSOR_BR */
 	sctx->framebuffer.atom.num_dw += 18; /* MSAA sample locations */
-	sctx->framebuffer.atom.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
 
 	if (sctx->framebuffer.nr_samples != old_nr_samples) {
-		sctx->msaa_config.dirty = true;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 		/* Set sample locations as fragment shader constants. */
 		switch (sctx->framebuffer.nr_samples) {
@@ -2107,7 +2132,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		     old_nr_samples != SI_NUM_SMOOTH_AA_SAMPLES) &&
 		    (sctx->framebuffer.nr_samples != SI_NUM_SMOOTH_AA_SAMPLES ||
 		     old_nr_samples != 1))
-			sctx->msaa_sample_locs.dirty = true;
+			si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs);
 	}
 }
 
@@ -2141,20 +2166,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 				RADEON_PRIO_COLOR_META);
 		}
 
-		r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 13);
+		r600_write_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
+					   sctx->b.chip_class >= VI ? 14 : 13);
 		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
 		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
 		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
 		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
 		radeon_emit(cs, cb->cb_color_info | tex->cb_color_info); /* R_028C70_CB_COLOR0_INFO */
 		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
-		radeon_emit(cs, 0);			/* R_028C78 unused */
+		radeon_emit(cs, cb->cb_dcc_control);	/* R_028C78_CB_COLOR0_DCC_CONTROL */
 		radeon_emit(cs, tex->cmask.base_address_reg);	/* R_028C7C_CB_COLOR0_CMASK */
 		radeon_emit(cs, tex->cmask.slice_tile_max);	/* R_028C80_CB_COLOR0_CMASK_SLICE */
 		radeon_emit(cs, cb->cb_color_fmask);		/* R_028C84_CB_COLOR0_FMASK */
 		radeon_emit(cs, cb->cb_color_fmask_slice);	/* R_028C88_CB_COLOR0_FMASK_SLICE */
 		radeon_emit(cs, tex->color_clear_value[0]);	/* R_028C8C_CB_COLOR0_CLEAR_WORD0 */
 		radeon_emit(cs, tex->color_clear_value[1]);	/* R_028C90_CB_COLOR0_CLEAR_WORD1 */
+
+		if (sctx->b.chip_class >= VI)
+			radeon_emit(cs, 0);	/* R_028C94_CB_COLOR0_DCC_BASE */
 	}
 	/* set CB_COLOR1_INFO for possible dual-src blending */
 	if (i == 1 && state->cbufs[0]) {
@@ -2249,22 +2278,35 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 	sctx->ps_iter_samples = min_samples;
 
 	if (sctx->framebuffer.nr_samples > 1)
-		sctx->msaa_config.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 /*
  * Samplers
  */
 
-static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx,
-							struct pipe_resource *texture,
-							const struct pipe_sampler_view *state)
+/**
+ * Create a sampler view.
+ *
+ * @param ctx		context
+ * @param texture	texture
+ * @param state		sampler view template
+ * @param width0	width0 override (for compressed textures as int)
+ * @param height0	height0 override (for compressed textures as int)
+ * @param force_level   set the base address to the level (for compressed textures)
+ */
+struct pipe_sampler_view *
+si_create_sampler_view_custom(struct pipe_context *ctx,
+			      struct pipe_resource *texture,
+			      const struct pipe_sampler_view *state,
+			      unsigned width0, unsigned height0,
+			      unsigned force_level)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
 	struct r600_texture *tmp = (struct r600_texture*)texture;
 	const struct util_format_description *desc;
-	unsigned format, num_format;
+	unsigned format, num_format, base_level, first_level, last_level;
 	uint32_t pitch = 0;
 	unsigned char state_swizzle[4], swizzle[4];
 	unsigned height, depth, width;
@@ -2297,7 +2339,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 
 	/* Buffer resource. */
 	if (texture->target == PIPE_BUFFER) {
-		unsigned stride;
+		unsigned stride, num_records;
 
 		desc = util_format_description(state->format);
 		first_non_void = util_format_get_first_non_void_channel(state->format);
@@ -2306,10 +2348,16 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 		format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
 		num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
 
+		num_records = state->u.buf.last_element + 1 - state->u.buf.first_element;
+		num_records = MIN2(num_records, texture->width0 / stride);
+
+		if (sctx->b.chip_class >= VI)
+			num_records *= stride;
+
 		view->state[4] = va;
 		view->state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 				 S_008F04_STRIDE(stride);
-		view->state[6] = state->u.buf.last_element + 1 - state->u.buf.first_element;
+		view->state[6] = num_records;
 		view->state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
 				 S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
 				 S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
@@ -2437,13 +2485,25 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 		format = 0;
 	}
 
-	/* not supported any more */
-	//endian = si_colorformat_endian_swap(format);
+	base_level = 0;
+	first_level = state->u.tex.first_level;
+	last_level = state->u.tex.last_level;
+	width = width0;
+	height = height0;
+	depth = texture->depth0;
 
-	width = surflevel[0].npix_x;
-	height = surflevel[0].npix_y;
-	depth = surflevel[0].npix_z;
-	pitch = surflevel[0].nblk_x * util_format_get_blockwidth(pipe_format);
+	if (force_level) {
+		assert(force_level == first_level &&
+		       force_level == last_level);
+		base_level = force_level;
+		first_level = 0;
+		last_level = 0;
+		width = u_minify(width, force_level);
+		height = u_minify(height, force_level);
+		depth = u_minify(depth, force_level);
+	}
+
+	pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format);
 
 	if (texture->target == PIPE_TEXTURE_1D_ARRAY) {
 	        height = 1;
@@ -2453,8 +2513,7 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	} else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY)
 		depth = texture->array_size / 6;
 
-	va = tmp->resource.gpu_address + surflevel[0].offset;
-	va += tmp->mipmap_shift * surflevel[texture->last_level].slice_size * tmp->surface.array_size;
+	va = tmp->resource.gpu_address + surflevel[base_level].offset;
 
 	view->state[0] = va >> 8;
 	view->state[1] = (S_008F14_BASE_ADDRESS_HI(va >> 40) |
@@ -2467,11 +2526,11 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 			  S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
 			  S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
 			  S_008F1C_BASE_LEVEL(texture->nr_samples > 1 ?
-						      0 : state->u.tex.first_level - tmp->mipmap_shift) |
+						      0 : first_level) |
 			  S_008F1C_LAST_LEVEL(texture->nr_samples > 1 ?
 						      util_logbase2(texture->nr_samples) :
-						      state->u.tex.last_level - tmp->mipmap_shift) |
-			  S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, 0, false)) |
+						      last_level) |
+			  S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) |
 			  S_008F1C_POW2_PAD(texture->last_level > 0) |
 			  S_008F1C_TYPE(si_tex_dim(texture->target, texture->nr_samples)));
 	view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));
@@ -2523,6 +2582,16 @@ static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx
 	return &view->base;
 }
 
+static struct pipe_sampler_view *
+si_create_sampler_view(struct pipe_context *ctx, struct pipe_resource *texture,
+		       const struct pipe_sampler_view *state)
+{
+	/* Pass the texture's own width0/height0 (0 without a texture); force_level = 0. */
+	unsigned w = texture ? texture->width0 : 0;
+	unsigned h = texture ? texture->height0 : 0;
+	return si_create_sampler_view_custom(ctx, texture, state, w, h, 0);
+}
+
 static void si_sampler_view_destroy(struct pipe_context *ctx,
 				    struct pipe_sampler_view *state)
 {
@@ -2765,6 +2834,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx,
 			pipe_resource_reference(&dsti->buffer, src->buffer);
 			dsti->buffer_offset = src->buffer_offset;
 			dsti->stride = src->stride;
+			r600_context_add_resource_size(ctx, src->buffer);
 		}
 	} else {
 		for (i = 0; i < count; i++) {
@@ -2782,6 +2852,7 @@ static void si_set_index_buffer(struct pipe_context *ctx,
 	if (ib) {
 		pipe_resource_reference(&sctx->index_buffer.buffer, ib->buffer);
 	        memcpy(&sctx->index_buffer, ib, sizeof(*ib));
+		r600_context_add_resource_size(ctx, ib->buffer);
 	} else {
 		pipe_resource_reference(&sctx->index_buffer.buffer, NULL);
 	}
@@ -2845,6 +2916,30 @@ static void si_set_polygon_stipple(struct pipe_context *ctx,
 	}
 }
 
+static void si_set_tess_state(struct pipe_context *ctx,
+			      const float default_outer_level[4],
+			      const float default_inner_level[2])
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct pipe_constant_buffer cb;
+	float array[8] = {0}; /* only 6 floats are set below; zero the rest */
+	/* Pack the defaults: outer levels in array[0..3], inner in array[4..5]. */
+	memcpy(array, default_outer_level, sizeof(float) * 4);
+	memcpy(array+4, default_inner_level, sizeof(float) * 2);
+
+	cb.buffer = NULL;
+	cb.user_buffer = NULL;
+	cb.buffer_size = sizeof(array);
+	/* Upload all of array; the call receives &cb.buffer and &cb.buffer_offset. */
+	si_upload_const_buffer(sctx, (struct r600_resource**)&cb.buffer,
+			       (void*)array, sizeof(array),
+			       &cb.buffer_offset);
+	/* Bind for the TCS stage, then release the local reference to cb.buffer. */
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_TESS_CTRL,
+				 SI_DRIVER_STATE_CONST_BUF, &cb);
+	pipe_resource_reference(&cb.buffer, NULL);
+}
+
 static void si_texture_barrier(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
@@ -2870,6 +2965,8 @@ static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
 	si_need_cs_space((struct si_context*)ctx, num_dw, include_draw_vbo);
 }
 
+static void si_init_config(struct si_context *sctx);
+
 void si_init_state_functions(struct si_context *sctx)
 {
 	si_init_atom(&sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state, 0);
@@ -2920,6 +3017,7 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->b.b.texture_barrier = si_texture_barrier;
 	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
 	sctx->b.b.set_min_samples = si_set_min_samples;
+	sctx->b.b.set_tess_state = si_set_tess_state;
 
 	sctx->b.set_occlusion_query_state = si_set_occlusion_query_state;
 	sctx->b.need_gfx_cs_space = si_need_gfx_cs_space;
@@ -2931,24 +3029,31 @@ void si_init_state_functions(struct si_context *sctx)
 	} else {
 		sctx->b.dma_copy = si_dma_copy;
 	}
+
+	si_init_config(sctx);
 }
 
 static void
 si_write_harvested_raster_configs(struct si_context *sctx,
 				  struct si_pm4_state *pm4,
-				  unsigned raster_config)
+				  unsigned raster_config,
+				  unsigned raster_config_1)
 {
 	unsigned sh_per_se = MAX2(sctx->screen->b.info.max_sh_per_se, 1);
 	unsigned num_se = MAX2(sctx->screen->b.info.max_se, 1);
 	unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
-	unsigned num_rb = sctx->screen->b.info.r600_num_backends;
-	unsigned rb_per_pkr = num_rb / num_se / sh_per_se;
+	unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16);
+	unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
 	unsigned rb_per_se = num_rb / num_se;
-	unsigned se0_mask = (1 << rb_per_se) - 1;
-	unsigned se1_mask = se0_mask << rb_per_se;
+	unsigned se_mask[4];
 	unsigned se;
 
-	assert(num_se == 1 || num_se == 2);
+	se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
+	se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
+	se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
+	se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
+
+	assert(num_se == 1 || num_se == 2 || num_se == 4);
 	assert(sh_per_se == 1 || sh_per_se == 2);
 	assert(rb_per_pkr == 1 || rb_per_pkr == 2);
 
@@ -2956,17 +3061,16 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 	 * fields are for, so I'm leaving them as their default
 	 * values. */
 
-	se0_mask &= rb_mask;
-	se1_mask &= rb_mask;
-	if (num_se == 2 && (!se0_mask || !se1_mask)) {
-		raster_config &= C_028350_SE_MAP;
+	if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
+			     (!se_mask[2] && !se_mask[3]))) {
+		raster_config_1 &= C_028354_SE_PAIR_MAP;
 
-		if (!se0_mask) {
-			raster_config |=
-				S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+		if (!se_mask[0] && !se_mask[1]) {
+			raster_config_1 |=
+				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
 		} else {
-			raster_config |=
-				S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+			raster_config_1 |=
+				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
 		}
 	}
 
@@ -2974,10 +3078,23 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 		unsigned raster_config_se = raster_config;
 		unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
 		unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
+		int idx = (se / 2) * 2;
+
+		if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
+			raster_config_se &= C_028350_SE_MAP;
+
+			if (!se_mask[idx]) {
+				raster_config_se |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+			} else {
+				raster_config_se |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+			}
+		}
 
 		pkr0_mask &= rb_mask;
 		pkr1_mask &= rb_mask;
-		if (sh_per_se == 2 && (!pkr0_mask || !pkr1_mask)) {
+		if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
 			raster_config_se &= C_028350_PKR_MAP;
 
 			if (!pkr0_mask) {
@@ -2989,7 +3106,7 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 			}
 		}
 
-		if (rb_per_pkr == 2) {
+		if (rb_per_se >= 2) {
 			unsigned rb0_mask = 1 << (se * rb_per_se);
 			unsigned rb1_mask = rb0_mask << 1;
 
@@ -3007,7 +3124,7 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 				}
 			}
 
-			if (sh_per_se == 2) {
+			if (rb_per_se > 2) {
 				rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
 				rb1_mask = rb0_mask << 1;
 				rb0_mask &= rb_mask;
@@ -3026,19 +3143,28 @@ si_write_harvested_raster_configs(struct si_context *sctx,
 			}
 		}
 
-		si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
-			       SE_INDEX(se) | SH_BROADCAST_WRITES |
-			       INSTANCE_BROADCAST_WRITES);
+		/* GRBM_GFX_INDEX is privileged on VI */
+		if (sctx->b.chip_class <= CIK)
+			si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
+				       SE_INDEX(se) | SH_BROADCAST_WRITES |
+				       INSTANCE_BROADCAST_WRITES);
 		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se);
+		if (sctx->b.chip_class >= CIK)
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
 	}
 
-	si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
-		       SE_BROADCAST_WRITES | SH_BROADCAST_WRITES |
-		       INSTANCE_BROADCAST_WRITES);
+	/* GRBM_GFX_INDEX is privileged on VI */
+	if (sctx->b.chip_class <= CIK)
+		si_pm4_set_reg(pm4, GRBM_GFX_INDEX,
+			       SE_BROADCAST_WRITES | SH_BROADCAST_WRITES |
+			       INSTANCE_BROADCAST_WRITES);
 }
 
-void si_init_config(struct si_context *sctx)
+static void si_init_config(struct si_context *sctx)
 {
+	unsigned num_rb = MIN2(sctx->screen->b.info.r600_num_backends, 16);
+	unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
+	unsigned raster_config, raster_config_1;
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 
 	if (pm4 == NULL)
@@ -3046,24 +3172,18 @@ void si_init_config(struct si_context *sctx)
 
 	si_cmd_context_control(pm4);
 
-	si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0x0);
-	si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0x0);
+	si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
+	si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
 
 	/* FIXME calculate these values somehow ??? */
 	si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, 0x80);
 	si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
 	si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
 
-	si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0x0);
 	si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
 	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0);
 	si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
 
-	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0);
-	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0);
-	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0);
-	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT, 0);
-
 	si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
 	si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
 	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
@@ -3076,62 +3196,78 @@ void si_init_config(struct si_context *sctx)
 
 	si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
 
-	if (sctx->b.chip_class >= CIK) {
-		switch (sctx->screen->b.family) {
-		case CHIP_BONAIRE:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x16000012);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0);
-			break;
-		case CHIP_HAWAII:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0x3a00161a);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0x0000002e);
-			break;
-		case CHIP_KAVERI:
-			/* XXX todo */
-		case CHIP_KABINI:
-			/* XXX todo */
-		case CHIP_MULLINS:
-			/* XXX todo */
-		default:
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, 0);
-			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, 0);
-			break;
-		}
+	switch (sctx->screen->b.family) {
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+		raster_config = 0x2a00126a;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_VERDE:
+		raster_config = 0x0000124a;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_OLAND:
+		raster_config = 0x00000082;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_HAINAN:
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_BONAIRE:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_HAWAII:
+		raster_config = 0x3a00161a;
+		raster_config_1 = 0x0000002e;
+		break;
+	case CHIP_FIJI:
+		/* Fiji should be same as Hawaii, but that causes corruption in some cases */
+		raster_config = 0x16000012; /* 0x3a00161a */
+		raster_config_1 = 0x0000002a; /* 0x0000002e */
+		break;
+	case CHIP_TONGA:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+		break;
+	case CHIP_ICELAND:
+		raster_config = 0x00000002;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_CARRIZO:
+		raster_config = 0x00000002;
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_KAVERI:
+		/* KV should be 0x00000002, but that causes problems with radeon */
+		raster_config = 0x00000000; /* 0x00000002 */
+		raster_config_1 = 0x00000000;
+		break;
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	default:
+		fprintf(stderr,
+			"radeonsi: Unknown GPU, using 0 for raster_config\n");
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	}
+
+	/* Always use the default config when all backends are enabled
+	 * (or when we failed to determine the enabled backends).
+	 */
+	if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
+		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG,
+			       raster_config);
+		if (sctx->b.chip_class >= CIK)
+			si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1,
+				       raster_config_1);
 	} else {
-		unsigned rb_mask = sctx->screen->b.info.si_backend_enabled_mask;
-		unsigned num_rb = sctx->screen->b.info.r600_num_backends;
-		unsigned raster_config;
-
-		switch (sctx->screen->b.family) {
-		case CHIP_TAHITI:
-		case CHIP_PITCAIRN:
-			raster_config = 0x2a00126a;
-			break;
-		case CHIP_VERDE:
-			raster_config = 0x0000124a;
-			break;
-		case CHIP_OLAND:
-			raster_config = 0x00000082;
-			break;
-		case CHIP_HAINAN:
-			raster_config = 0;
-			break;
-		default:
-			fprintf(stderr,
-				"radeonsi: Unknown GPU, using 0 for raster_config\n");
-			raster_config = 0;
-			break;
-		}
-
-		/* Always use the default config when all backends are enabled
-		 * (or when we failed to determine the enabled backends).
-		 */
-		if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
-			si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG,
-				       raster_config);
-		} else {
-			si_write_harvested_raster_configs(sctx, pm4, raster_config);
-		}
+		si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
 	}
 
 	si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
@@ -3153,8 +3289,6 @@ void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0));
 	si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0));
 	si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0));
-	si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, 0);
-	si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, 0);
 	si_pm4_set_reg(pm4, R_028028_DB_STENCIL_CLEAR, 0);
 	si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
 	si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
@@ -3173,10 +3307,21 @@ void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
 
 	if (sctx->b.chip_class >= CIK) {
+		si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xfffc));
+		si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
+		si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xfffe));
+		si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
 		si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff));
 		si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(0));
 		si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));
 	}
 
+	if (sctx->b.chip_class >= VI) {
+		si_pm4_set_reg(pm4, R_028424_CB_DCC_CONTROL,
+			       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1));
+		si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30);
+		si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32);
+	}
+
 	sctx->init_config = pm4;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 5e68b162137..b8f63c5dd36 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -30,6 +30,8 @@
 #include "si_pm4.h"
 #include "radeon/r600_pipe_common.h"
 
+#define SI_NUM_SHADERS (PIPE_SHADER_TESS_EVAL+1)
+
 struct si_screen;
 struct si_shader;
 
@@ -92,18 +94,21 @@ union si_state {
 		struct si_pm4_state		*blend_color;
 		struct si_pm4_state		*clip;
 		struct si_state_sample_mask	*sample_mask;
-		struct si_state_scissor		*scissor;
-		struct si_state_viewport	*viewport;
+		struct si_state_scissor		*scissor[16];
+		struct si_state_viewport	*viewport[16];
 		struct si_state_rasterizer	*rasterizer;
 		struct si_state_dsa		*dsa;
 		struct si_pm4_state		*fb_rs;
 		struct si_pm4_state		*fb_blend;
 		struct si_pm4_state		*dsa_stencil_ref;
 		struct si_pm4_state		*ta_bordercolor_base;
+		struct si_pm4_state		*ls;
+		struct si_pm4_state		*hs;
 		struct si_pm4_state		*es;
 		struct si_pm4_state		*gs;
 		struct si_pm4_state		*gs_rings;
-		struct si_pm4_state		*gs_onoff;
+		struct si_pm4_state		*tf_ring;
+		struct si_pm4_state		*vgt_shader_config;
 		struct si_pm4_state		*vs;
 		struct si_pm4_state		*ps;
 		struct si_pm4_state		*spi;
@@ -111,6 +116,11 @@ union si_state {
 	struct si_pm4_state	*array[0];
 };
 
+struct si_shader_data { /* per-stage shader userdata base state */
+	struct r600_atom	atom;
+	uint32_t		sh_base[SI_NUM_SHADERS]; /* indexed by PIPE_SHADER_*; used as the userdata base register */
+};
+
 #define SI_NUM_USER_SAMPLERS            16 /* AKA OpenGL textures units per shader */
 #define SI_POLY_STIPPLE_SAMPLER         SI_NUM_USER_SAMPLERS
 #define SI_NUM_SAMPLERS                 (SI_POLY_STIPPLE_SAMPLER + 1)
@@ -135,68 +145,61 @@ union si_state {
- * Ring buffers:        0..1
- * Streamout buffers:   2..5
+ * Ring buffers:        0..4
+ * Streamout buffers:   5..8
  */
-#define SI_RING_ESGS		0
-#define SI_RING_GSVS		1
-#define SI_NUM_RING_BUFFERS	2
+#define SI_RING_TESS_FACTOR	0 /* for HS (TCS)  */
+#define SI_RING_ESGS		0 /* for ES, GS */
+#define SI_RING_GSVS		1 /* for GS, VS */
+#define SI_RING_GSVS_1		2 /* 1, 2, 3 for GS */
+#define SI_RING_GSVS_2		3
+#define SI_RING_GSVS_3		4
+#define SI_NUM_RING_BUFFERS	5
 #define SI_SO_BUF_OFFSET	SI_NUM_RING_BUFFERS
 #define SI_NUM_RW_BUFFERS	(SI_SO_BUF_OFFSET + 4)
 
 #define SI_NUM_VERTEX_BUFFERS	16
 
 
-/* This represents resource descriptors in memory, such as buffer resources,
+/* This represents descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
  */
 struct si_descriptors {
-	struct r600_atom atom;
-
-	/* The size of one resource descriptor. */
+	/* The list of descriptors in malloc'd memory. */
+	uint32_t *list;
+	/* The size of one descriptor. */
 	unsigned element_dw_size;
-	/* The maximum number of resource descriptors. */
+	/* The maximum number of descriptors. */
 	unsigned num_elements;
+	/* Whether the list has been changed and should be re-uploaded. */
+	bool list_dirty;
 
-	/* The buffer where resource descriptors are stored. */
+	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
 	unsigned buffer_offset;
 
-	/* The i-th bit is set if that element is dirty (changed but not emitted). */
-	uint64_t dirty_mask;
 	/* The i-th bit is set if that element is enabled (non-NULL resource). */
 	uint64_t enabled_mask;
 
-	/* We can't update descriptors directly because the GPU might be
-	 * reading them at the same time, so we have to update them
-	 * in a copy-on-write manner. Each such copy is called a context,
-	 * which is just another array descriptors in the same buffer. */
-	unsigned current_context_id;
-	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
-	unsigned context_size;
-
-	/* The shader userdata register where the 64-bit pointer to the descriptor
+	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
-	unsigned shader_userdata_reg;
+	unsigned shader_userdata_offset;
+	/* Whether the pointer should be re-emitted. */
+	bool pointer_dirty;
 };
 
 struct si_sampler_views {
 	struct si_descriptors		desc;
 	struct pipe_sampler_view	*views[SI_NUM_SAMPLER_VIEWS];
-	uint32_t			*desc_data[SI_NUM_SAMPLER_VIEWS];
 };
 
 struct si_sampler_states {
 	struct si_descriptors		desc;
-	uint32_t			*desc_data[SI_NUM_SAMPLER_STATES];
 	void				*saved_states[2]; /* saved for u_blitter */
 };
 
 struct si_buffer_resources {
 	struct si_descriptors		desc;
-	unsigned			num_buffers;
 	enum radeon_bo_usage		shader_usage; /* READ, WRITE, or READWRITE */
 	enum radeon_bo_priority		priority;
 	struct pipe_resource		**buffers; /* this has num_buffers elements */
-	uint32_t			*desc_storage; /* this has num_buffers*4 elements */
-	uint32_t			**desc_data; /* an array of pointers pointing to desc_storage */
 };
 
 #define si_pm4_block_idx(member) \
@@ -232,20 +235,18 @@ struct si_buffer_resources {
 /* si_descriptors.c */
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 				unsigned start, unsigned count, void **states);
-void si_update_vertex_buffers(struct si_context *sctx);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
-			unsigned element_size, unsigned index_stride);
+			unsigned element_size, unsigned index_stride, uint64_t offset);
 void si_init_all_descriptors(struct si_context *sctx);
+bool si_upload_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer);
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
 			    const uint8_t *ptr, unsigned size, uint32_t *const_offset);
+void si_shader_change_notify(struct si_context *sctx);
 
 /* si_state.c */
 struct si_shader_selector;
@@ -256,7 +257,6 @@ boolean si_is_format_supported(struct pipe_screen *screen,
                                unsigned sample_count,
                                unsigned usage);
 void si_init_state_functions(struct si_context *sctx);
-void si_init_config(struct si_context *sctx);
 unsigned cik_bank_wh(unsigned bankwh);
 unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode);
 unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect);
@@ -264,6 +264,12 @@ unsigned cik_tile_split(unsigned tile_split);
 unsigned si_array_mode(unsigned mode);
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex);
 unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil);
+struct pipe_sampler_view *
+si_create_sampler_view_custom(struct pipe_context *ctx,
+			      struct pipe_resource *texture,
+			      const struct pipe_sampler_view *state,
+			      unsigned width0, unsigned height0,
+			      unsigned force_level);
 
 /* si_state_shader.c */
 void si_update_shaders(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 2e77d85a80d..4c21655596c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -31,6 +31,7 @@
 
 #include "util/u_index_modify.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_prim.h"
 
 static void si_decompress_textures(struct si_context *sctx)
 {
@@ -64,6 +65,7 @@ static unsigned si_conv_pipe_prim(unsigned mode)
 		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_008958_DI_PT_LINESTRIP_ADJ,
 		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_008958_DI_PT_TRILIST_ADJ,
 		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_008958_DI_PT_TRISTRIP_ADJ,
+		[PIPE_PRIM_PATCHES]			= V_008958_DI_PT_PATCH,
 		[R600_PRIM_RECTANGLE_LIST]		= V_008958_DI_PT_RECTLIST
         };
 	assert(mode < Elements(prim_conv));
@@ -87,6 +89,7 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode)
 		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
 		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
 		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_PATCHES]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
 		[R600_PRIM_RECTANGLE_LIST]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP
 	};
 	assert(mode < Elements(prim_conv));
@@ -94,8 +97,128 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode)
 	return prim_conv[mode];
 }
 
+/**
+ * Calculate the LDS size and layout for the tessellation stages (VS, TCS,
+ * TES); LS.LDS_SIZE is shared by all 3. The LS RSRC1/RSRC2 registers and the
+ * packed layout/offset userdata SGPRs for LS, HS and TES are written to the
+ * gfx CS unless the LS/TCS/TES-base/input-CP state matches the last call.
+ * *num_patches is always set, even when nothing is emitted.
+ */
+static void si_emit_derived_tess_state(struct si_context *sctx,
+				       const struct pipe_draw_info *info,
+				       unsigned *num_patches)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct si_shader_selector *ls = sctx->vs_shader;
+	/* The TES pointer will only be used for sctx->last_tcs.
+	 * It would be wrong to think that TCS = TES. */
+	struct si_shader_selector *tcs =
+		sctx->tcs_shader ? sctx->tcs_shader : sctx->tes_shader;
+	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
+	unsigned num_tcs_input_cp = info->vertices_per_patch;
+	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
+	unsigned num_tcs_patch_outputs;
+	unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
+	unsigned input_patch_size, output_patch_size, output_patch0_offset;
+	unsigned perpatch_output_offset, lds_size, ls_rsrc2;
+	unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
+
+	*num_patches = 1; /* TODO: calculate this */
+
+	if (sctx->last_ls == ls->current &&
+	    sctx->last_tcs == tcs &&
+	    sctx->last_tes_sh_base == tes_sh_base &&
+	    sctx->last_num_tcs_input_cp == num_tcs_input_cp)
+		return; /* same state as the last emit: nothing to do */
+
+	sctx->last_ls = ls->current;
+	sctx->last_tcs = tcs;
+	sctx->last_tes_sh_base = tes_sh_base;
+	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
+
+	/* This calculates how shader inputs and outputs among VS, TCS, and TES
+	 * are laid out in LDS. */
+	num_tcs_inputs = util_last_bit64(ls->outputs_written);
+
+	if (sctx->tcs_shader) {
+		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
+		num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+		num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
+	} else {
+		/* No TCS. Route varyings from LS to TES. */
+		num_tcs_outputs = num_tcs_inputs;
+		num_tcs_output_cp = num_tcs_input_cp;
+		num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
+	}
+
+	input_vertex_size = num_tcs_inputs * 16; /* 16 per slot; presumably bytes (vec4) - confirm */
+	output_vertex_size = num_tcs_outputs * 16;
+
+	input_patch_size = num_tcs_input_cp * input_vertex_size;
+
+	pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
+	output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+
+	output_patch0_offset = sctx->tcs_shader ? input_patch_size * *num_patches : 0;
+	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
+
+	lds_size = output_patch0_offset + output_patch_size * *num_patches;
+	ls_rsrc2 = ls->current->ls_rsrc2;
+
+	if (sctx->b.chip_class >= CIK) {
+		assert(lds_size <= 65536);
+		ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 512) / 512);
+	} else {
+		assert(lds_size <= 32768);
+		ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256);
+	}
+
+	/* Due to a hw bug, RSRC2_LS must be written twice with another
+	 * LS register written in between. */
+	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
+		si_write_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
+	si_write_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
+	radeon_emit(cs, ls->current->ls_rsrc1);
+	radeon_emit(cs, ls_rsrc2);
+
+	/* Compute userdata SGPRs; the asserts verify each packed field fits. */
+	assert(((input_vertex_size / 4) & ~0xff) == 0);
+	assert(((output_vertex_size / 4) & ~0xff) == 0);
+	assert(((input_patch_size / 4) & ~0x1fff) == 0);
+	assert(((output_patch_size / 4) & ~0x1fff) == 0);
+	assert(((output_patch0_offset / 16) & ~0xffff) == 0);
+	assert(((perpatch_output_offset / 16) & ~0xffff) == 0);
+	assert(num_tcs_input_cp <= 32);
+	assert(num_tcs_output_cp <= 32);
+
+	tcs_in_layout = (input_patch_size / 4) |
+			((input_vertex_size / 4) << 13);
+	tcs_out_layout = (output_patch_size / 4) |
+			 ((output_vertex_size / 4) << 13);
+	tcs_out_offsets = (output_patch0_offset / 16) |
+			  ((perpatch_output_offset / 16) << 16);
+
+	/* Set them for LS. */
+	si_write_sh_reg(cs,
+		R_00B530_SPI_SHADER_USER_DATA_LS_0 + SI_SGPR_LS_OUT_LAYOUT * 4,
+		tcs_in_layout);
+
+	/* Set them for TCS. */
+	si_write_sh_reg_seq(cs,
+		R_00B430_SPI_SHADER_USER_DATA_HS_0 + SI_SGPR_TCS_OUT_OFFSETS * 4, 3);
+	radeon_emit(cs, tcs_out_offsets);
+	radeon_emit(cs, tcs_out_layout | (num_tcs_input_cp << 26)); /* HS gets the input CP count in bits 26+ */
+	radeon_emit(cs, tcs_in_layout);
+
+	/* Set them for TES. */
+	si_write_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TCS_OUT_OFFSETS * 4, 2);
+	radeon_emit(cs, tcs_out_offsets);
+	radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26)); /* TES gets the output CP count in bits 26+ */
+}
+
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
-					  const struct pipe_draw_info *info)
+					  const struct pipe_draw_info *info,
+					  unsigned num_patches)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned prim = info->mode;
@@ -104,11 +227,41 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	/* SWITCH_ON_EOP(0) is always preferable. */
 	bool wd_switch_on_eop = false;
 	bool ia_switch_on_eop = false;
+	bool ia_switch_on_eoi = false;
 	bool partial_vs_wave = false;
+	bool partial_es_wave = false;
 
 	if (sctx->gs_shader)
 		primgroup_size = 64; /* recommended with a GS */
 
+	if (sctx->tes_shader) {
+		unsigned num_cp_out =
+			sctx->tcs_shader ?
+			sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+			info->vertices_per_patch;
+		unsigned max_size = 256 / MAX2(info->vertices_per_patch, num_cp_out);
+
+		primgroup_size = MIN2(primgroup_size, max_size);
+
+		/* primgroup_size must be set to a multiple of NUM_PATCHES */
+		primgroup_size = (primgroup_size / num_patches) * num_patches;
+
+		/* SWITCH_ON_EOI must be set if PrimID is used.
+		 * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+		if ((sctx->tcs_shader && sctx->tcs_shader->info.uses_primid) ||
+		    sctx->tes_shader->info.uses_primid) {
+			ia_switch_on_eoi = true;
+			partial_es_wave = true;
+		}
+
+		/* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
+		if ((sctx->b.family == CHIP_TAHITI ||
+		     sctx->b.family == CHIP_PITCAIRN ||
+		     sctx->b.family == CHIP_BONAIRE) &&
+		    sctx->gs_shader)
+			partial_vs_wave = true;
+	}
+
 	/* This is a hardware requirement. */
 	if ((rs && rs->line_stipple_enable) ||
 	    (sctx->b.screen->debug_flags & DBG_SWITCH_ON_EOP)) {
@@ -139,14 +292,52 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		    (info->indirect || info->instance_count > 1))
 			wd_switch_on_eop = true;
 
+		/* USE_OPAQUE doesn't work when WD_SWITCH_ON_EOP is 0. */
+		if (info->count_from_stream_output)
+			wd_switch_on_eop = true;
+
 		/* If the WD switch is false, the IA switch must be false too. */
 		assert(wd_switch_on_eop || !ia_switch_on_eop);
 	}
 
+	/* Hw bug with single-primitive instances and SWITCH_ON_EOI
+	 * on multi-SE chips. */
+	if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
+	    (info->indirect ||
+	     (info->instance_count > 1 &&
+	      u_prims_for_vertices(info->mode, info->count) <= 1)))
+		sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
+
+	/* Instancing bug on 2 SE chips. */
+	if (sctx->b.screen->info.max_se == 2 && ia_switch_on_eoi &&
+	    (info->indirect || info->instance_count > 1))
+		partial_vs_wave = true;
+
 	return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
+		S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
 		S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
+		S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
 		S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
-		S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0);
+		S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0) |
+		S_028AA8_MAX_PRIMGRP_IN_WAVE(sctx->b.chip_class >= VI ? 2 : 0);
+}
+
+static unsigned si_get_ls_hs_config(struct si_context *sctx,
+				    const struct pipe_draw_info *info,
+				    unsigned num_patches)
+{
+	unsigned num_output_cp;
+	/* Build the VGT_LS_HS_CONFIG (0x028B58) value; 0 when no TES is bound. */
+	if (!sctx->tes_shader)
+		return 0;
+
+	num_output_cp = sctx->tcs_shader ?
+		sctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] :
+		info->vertices_per_patch; /* no TCS: output CP count equals the input patch size */
+
+	return S_028B58_NUM_PATCHES(num_patches) |
+		S_028B58_HS_NUM_INPUT_CP(info->vertices_per_patch) |
+		S_028B58_HS_NUM_OUTPUT_CP(num_output_cp);
+}
 
 static void si_emit_scratch_reloc(struct si_context *sctx)
@@ -202,22 +393,31 @@ static void si_emit_draw_registers(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
-	unsigned ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info);
+	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
+
+	if (sctx->tes_shader)
+		si_emit_derived_tess_state(sctx, info, &num_patches);
+
+	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
+	ls_hs_config = si_get_ls_hs_config(sctx, info, num_patches);
 
 	/* Draw state. */
 	if (prim != sctx->last_prim ||
-	    ia_multi_vgt_param != sctx->last_multi_vgt_param) {
+	    ia_multi_vgt_param != sctx->last_multi_vgt_param ||
+	    ls_hs_config != sctx->last_ls_hs_config) {
 		if (sctx->b.chip_class >= CIK) {
 			radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0));
 			radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */
 			radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */
-			radeon_emit(cs, 0); /* VGT_LS_HS_CONFIG */
+			radeon_emit(cs, ls_hs_config); /* VGT_LS_HS_CONFIG */
 		} else {
 			r600_write_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
 			r600_write_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
+			r600_write_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
 		}
 		sctx->last_prim = prim;
 		sctx->last_multi_vgt_param = ia_multi_vgt_param;
+		sctx->last_ls_hs_config = ls_hs_config;
 	}
 
 	if (gs_out_prim != sctx->last_gs_out_prim) {
@@ -245,8 +445,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_index_buffer *ib)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	unsigned sh_base_reg = (sctx->gs_shader ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
-						  R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
 
 	if (info->count_from_stream_output) {
 		struct r600_so_target *t =
@@ -275,12 +474,24 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	if (info->indexed) {
 		radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
 
-		if (ib->index_size == 4) {
-			radeon_emit(cs, V_028A7C_VGT_INDEX_32 | (SI_BIG_ENDIAN ?
-					V_028A7C_VGT_DMA_SWAP_32_BIT : 0));
-		} else {
-			radeon_emit(cs, V_028A7C_VGT_INDEX_16 | (SI_BIG_ENDIAN ?
-					V_028A7C_VGT_DMA_SWAP_16_BIT : 0));
+		/* index type */
+		switch (ib->index_size) {
+		case 1:
+			radeon_emit(cs, V_028A7C_VGT_INDEX_8);
+			break;
+		case 2:
+			radeon_emit(cs, V_028A7C_VGT_INDEX_16 |
+				    (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
+					     V_028A7C_VGT_DMA_SWAP_16_BIT : 0));
+			break;
+		case 4:
+			radeon_emit(cs, V_028A7C_VGT_INDEX_32 |
+				    (SI_BIG_ENDIAN && sctx->b.chip_class <= CIK ?
+					     V_028A7C_VGT_DMA_SWAP_32_BIT : 0));
+			break;
+		default:
+			assert(!"unreachable");
+			return;
 		}
 	}
 
@@ -406,9 +617,14 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato
 
 	if (sctx->flags & SI_CONTEXT_INV_TC_L1)
 		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L2)
+	if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
 		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
 
+		/* TODO: this might not be needed. */
+		if (sctx->chip_class >= VI)
+			cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
+	}
+
 	if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
 		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
 				 S_0085F0_CB0_DEST_BASE_ENA(1) |
@@ -520,8 +736,14 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	    (info->indexed || !info->count_from_stream_output))
 		return;
 
-	if (!sctx->ps_shader || !sctx->vs_shader)
+	if (!sctx->ps_shader || !sctx->vs_shader) {
+		assert(0);
 		return;
+	}
+	if (!!sctx->tes_shader != (info->mode == PIPE_PRIM_PATCHES)) {
+		assert(0);
+		return;
+	}
 
 	si_decompress_textures(sctx);
 
@@ -532,15 +754,15 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	 * current_rast_prim for this draw_vbo call. */
 	if (sctx->gs_shader)
 		sctx->current_rast_prim = sctx->gs_shader->gs_output_prim;
+	else if (sctx->tes_shader)
+		sctx->current_rast_prim =
+			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 	else
 		sctx->current_rast_prim = info->mode;
 
 	si_update_shaders(sctx);
-
-	if (sctx->vertex_buffers_dirty) {
-		si_update_vertex_buffers(sctx);
-		sctx->vertex_buffers_dirty = false;
-	}
+	if (!si_upload_shader_descriptors(sctx))
+		return;
 
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */
@@ -550,7 +772,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		ib.offset = sctx->index_buffer.offset;
 
 		/* Translate or upload, if needed. */
-		if (ib.index_size == 1) {
+		/* 8-bit indices are supported on VI. */
+		if (sctx->b.chip_class <= CIK && ib.index_size == 1) {
 			struct pipe_resource *out_buffer = NULL;
 			unsigned out_offset, start, count, start_offset;
 			void *ptr;
@@ -585,6 +808,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		}
 	}
 
+	/* TODO: VI should read index buffers through TC, so this shouldn't be
+	 * needed on VI. */
 	if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) {
 		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
 		r600_resource(ib.buffer)->TC_L2_dirty = false;
@@ -592,7 +817,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	/* Check flush flags. */
 	if (sctx->b.flags)
-		sctx->atoms.s.cache_flush->dirty = true;
+		si_mark_atom_dirty(sctx, sctx->atoms.s.cache_flush);
 
 	si_need_cs_space(sctx, 0, TRUE);
 
@@ -618,7 +843,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-	if (sctx->b.family == CHIP_HAWAII &&
+	if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) &&
 	    (sctx->b.streamout.streamout_enabled ||
 	     sctx->b.streamout.prims_gen_query_enabled)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 208c8523ef1..0347014948d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -30,9 +30,135 @@
 #include "sid.h"
 
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_ureg.h"
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
+static void si_set_tesseval_regs(struct si_shader *shader,
+				 struct si_pm4_state *pm4)
+{
+	struct tgsi_shader_info *info = &shader->selector->info;
+	unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
+	unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
+	bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
+	bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
+	unsigned type, partitioning, topology;
+
+	switch (tes_prim_mode) {
+	case PIPE_PRIM_LINES:
+		type = V_028B6C_TESS_ISOLINE;
+		break;
+	case PIPE_PRIM_TRIANGLES:
+		type = V_028B6C_TESS_TRIANGLE;
+		break;
+	case PIPE_PRIM_QUADS:
+		type = V_028B6C_TESS_QUAD;
+		break;
+	default:
+		assert(0);
+		return;
+	}
+
+	switch (tes_spacing) {
+	case PIPE_TESS_SPACING_FRACTIONAL_ODD:
+		partitioning = V_028B6C_PART_FRAC_ODD;
+		break;
+	case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
+		partitioning = V_028B6C_PART_FRAC_EVEN;
+		break;
+	case PIPE_TESS_SPACING_EQUAL:
+		partitioning = V_028B6C_PART_INTEGER;
+		break;
+	default:
+		assert(0);
+		return;
+	}
+
+	if (tes_point_mode)
+		topology = V_028B6C_OUTPUT_POINT;
+	else if (tes_prim_mode == PIPE_PRIM_LINES)
+		topology = V_028B6C_OUTPUT_LINE;
+	else if (tes_vertex_order_cw)
+		/* for some reason, this must be the other way around */
+		topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
+	else
+		topology = V_028B6C_OUTPUT_TRIANGLE_CW;
+
+	si_pm4_set_reg(pm4, R_028B6C_VGT_TF_PARAM,
+		       S_028B6C_TYPE(type) |
+		       S_028B6C_PARTITIONING(partitioning) |
+		       S_028B6C_TOPOLOGY(topology));
+}
+
+static void si_shader_ls(struct si_shader *shader)
+{
+	struct si_pm4_state *pm4;
+	unsigned num_sgprs, num_user_sgprs;
+	unsigned vgpr_comp_cnt;
+	uint64_t va;
+
+	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (pm4 == NULL)
+		return;
+
+	va = shader->bo->gpu_address;
+	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+
+	/* We need at least 2 components for LS.
+	 * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
+	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
+
+	num_user_sgprs = SI_LS_NUM_USER_SGPR;
+	num_sgprs = shader->num_sgprs;
+	if (num_user_sgprs > num_sgprs) {
+		/* Last 2 reserved SGPRs are used for VCC */
+		num_sgprs = num_user_sgprs + 2;
+	}
+	assert(num_sgprs <= 104);
+
+	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
+	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
+
+	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
+		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt);
+	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
+}
+
+static void si_shader_hs(struct si_shader *shader)
+{
+	struct si_pm4_state *pm4;
+	unsigned num_sgprs, num_user_sgprs;
+	uint64_t va;
+
+	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (pm4 == NULL)
+		return;
+
+	va = shader->bo->gpu_address;
+	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
+
+	num_user_sgprs = SI_TCS_NUM_USER_SGPR;
+	num_sgprs = shader->num_sgprs;
+	/* One SGPR after user SGPRs is pre-loaded with tessellation factor
+	 * buffer offset. */
+	if ((num_user_sgprs + 1) > num_sgprs) {
+		/* Last 2 reserved SGPRs are used for VCC */
+		num_sgprs = num_user_sgprs + 1 + 2;
+	}
+	assert(num_sgprs <= 104);
+
+	si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
+	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
+	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
+		       S_00B428_VGPRS((shader->num_vgprs - 1) / 4) |
+		       S_00B428_SGPRS((num_sgprs - 1) / 8));
+	si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
+		       S_00B42C_USER_SGPR(num_user_sgprs) |
+		       S_00B42C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+}
+
 static void si_shader_es(struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
@@ -48,9 +174,15 @@ static void si_shader_es(struct si_shader *shader)
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
-	vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+	if (shader->selector->type == PIPE_SHADER_VERTEX) {
+		vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+		num_user_sgprs = SI_VS_NUM_USER_SGPR;
+	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+		vgpr_comp_cnt = 3; /* all components are needed for TES */
+		num_user_sgprs = SI_TES_NUM_USER_SGPR;
+	} else
+		assert(0);
 
-	num_user_sgprs = SI_VS_NUM_USER_SGPR;
 	num_sgprs = shader->num_sgprs;
 	/* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
 	if ((num_user_sgprs + 1) > num_sgprs) {
@@ -69,17 +201,37 @@ static void si_shader_es(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
 		       S_00B32C_USER_SGPR(num_user_sgprs) |
 		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
+
+	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+		si_set_tesseval_regs(shader, pm4);
+}
+
+static unsigned si_gs_get_max_stream(struct si_shader *shader)
+{
+	struct pipe_stream_output_info *so = &shader->selector->so;
+	unsigned max_stream = 0, i;
+
+	if (so->num_outputs == 0)
+		return 0;
+
+	for (i = 0; i < so->num_outputs; i++) {
+		if (so->output[i].stream > max_stream)
+			max_stream = so->output[i].stream;
+	}
+	return max_stream;
 }
 
 static void si_shader_gs(struct si_shader *shader)
 {
-	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2);
+	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
 	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
-	unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
+	unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
+	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
 	uint64_t va;
+	unsigned max_stream = si_gs_get_max_stream(shader);
 
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(gsvs_itemsize < (1 << 15));
@@ -107,16 +259,23 @@ static void si_shader_gs(struct si_shader *shader)
 		       S_028A40_GS_WRITE_OPTIMIZE(1));
 
 	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
-	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize);
-	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize);
+	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
+	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
 
 	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-		       util_bitcount64(shader->selector->gs_used_inputs) * (16 >> 2));
-	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize);
+		       util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
+	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
 
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
 
-	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize);
+	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
+	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
+	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0);
+	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0);
+
+	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
+		       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
+		       S_028B90_ENABLE(gs_num_invocations > 0));
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
@@ -143,19 +302,29 @@ static void si_shader_gs(struct si_shader *shader)
 
 static void si_shader_vs(struct si_shader *shader)
 {
-	struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
-	unsigned nparams, i, vgpr_comp_cnt;
+	unsigned nparams, vgpr_comp_cnt;
 	uint64_t va;
 	unsigned window_space =
 	   shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+	bool enable_prim_id = si_vs_exports_prim_id(shader);
 
 	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 
 	if (pm4 == NULL)
 		return;
 
+	/* If this is the GS copy shader, the GS state writes VGT_GS_MODE.
+	 * Otherwise, the VS state writes it.
+	 */
+	if (!shader->is_gs_copy_shader) {
+		si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
+			       S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0));
+		si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id);
+	} else
+		si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0);
+
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
@@ -163,8 +332,11 @@ static void si_shader_vs(struct si_shader *shader)
 		vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
 		num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_VERTEX) {
-		vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
+		vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
 		num_user_sgprs = SI_VS_NUM_USER_SGPR;
+	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
+		vgpr_comp_cnt = 3; /* all components are needed for TES */
+		num_user_sgprs = SI_TES_NUM_USER_SGPR;
 	} else
 		assert(0);
 
@@ -175,28 +347,8 @@ static void si_shader_vs(struct si_shader *shader)
 	}
 	assert(num_sgprs <= 104);
 
-	/* Certain attributes (position, psize, etc.) don't count as params.
-	 * VS is required to export at least one param and r600_shader_from_tgsi()
-	 * takes care of adding a dummy export.
-	 */
-	for (nparams = 0, i = 0 ; i < info->num_outputs; i++) {
-		switch (info->output_semantic_name[i]) {
-		case TGSI_SEMANTIC_CLIPVERTEX:
-		case TGSI_SEMANTIC_CLIPDIST:
-		case TGSI_SEMANTIC_CULLDIST:
-		case TGSI_SEMANTIC_POSITION:
-		case TGSI_SEMANTIC_PSIZE:
-		case TGSI_SEMANTIC_EDGEFLAG:
-		case TGSI_SEMANTIC_VIEWPORT_INDEX:
-		case TGSI_SEMANTIC_LAYER:
-			break;
-		default:
-			nparams++;
-		}
-	}
-	if (nparams < 1)
-		nparams = 1;
-
+	/* VS is required to export at least one param. */
+	nparams = MAX2(shader->nr_param_exports, 1);
 	si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG,
 		       S_0286C4_VS_EXPORT_COUNT(nparams - 1));
 
@@ -236,6 +388,9 @@ static void si_shader_vs(struct si_shader *shader)
 			       S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
 			       S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
 			       S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
+
+	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
+		si_set_tesseval_regs(shader, pm4);
 }
 
 static void si_shader_ps(struct si_shader *shader)
@@ -333,7 +488,18 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 
 	switch (shader->selector->type) {
 	case PIPE_SHADER_VERTEX:
-		if (shader->key.vs.as_es)
+		if (shader->key.vs.as_ls)
+			si_shader_ls(shader);
+		else if (shader->key.vs.as_es)
+			si_shader_es(shader);
+		else
+			si_shader_vs(shader);
+		break;
+	case PIPE_SHADER_TESS_CTRL:
+		si_shader_hs(shader);
+		break;
+	case PIPE_SHADER_TESS_EVAL:
+		if (shader->key.tes.as_es)
 			si_shader_es(shader);
 		else
 			si_shader_vs(shader);
@@ -351,7 +517,7 @@ static void si_shader_init_pm4_state(struct si_shader *shader)
 }
 
 /* Compute the key for the hw shader variant */
-static INLINE void si_shader_selector_key(struct pipe_context *ctx,
+static inline void si_shader_selector_key(struct pipe_context *ctx,
 					  struct si_shader_selector *sel,
 					  union si_shader_key *key)
 {
@@ -367,10 +533,27 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx,
 				key->vs.instance_divisors[i] =
 					sctx->vertex_elements->elements[i].instance_divisor;
 
-		if (sctx->gs_shader) {
+		if (sctx->tes_shader)
+			key->vs.as_ls = 1;
+		else if (sctx->gs_shader) {
 			key->vs.as_es = 1;
-			key->vs.gs_used_inputs = sctx->gs_shader->gs_used_inputs;
+			key->vs.es_enabled_outputs = sctx->gs_shader->inputs_read;
 		}
+
+		if (!sctx->gs_shader && sctx->ps_shader &&
+		    sctx->ps_shader->info.uses_primid)
+			key->vs.export_prim_id = 1;
+		break;
+	case PIPE_SHADER_TESS_CTRL:
+		key->tcs.prim_mode =
+			sctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+		break;
+	case PIPE_SHADER_TESS_EVAL:
+		if (sctx->gs_shader) {
+			key->tes.as_es = 1;
+			key->tes.es_enabled_outputs = sctx->gs_shader->inputs_read;
+		} else if (sctx->ps_shader && sctx->ps_shader->info.uses_primid)
+			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
 		break;
@@ -468,6 +651,7 @@ static int si_shader_select(struct pipe_context *ctx,
 		}
 		si_shader_init_pm4_state(shader);
 		sel->num_shaders++;
+		p_atomic_inc(&sctx->screen->b.num_compilations);
 	}
 
 	return 0;
@@ -485,6 +669,7 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 	sel->tokens = tgsi_dup_tokens(state->tokens);
 	sel->so = state->stream_output;
 	tgsi_scan_shader(state->tokens, &sel->info);
+	p_atomic_inc(&sscreen->b.num_shaders_created);
 
 	switch (pipe_shader_type) {
 	case PIPE_SHADER_GEOMETRY:
@@ -492,6 +677,8 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 			sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
 		sel->gs_max_out_vertices =
 			sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+		sel->gs_num_invocations =
+			sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
 
 		for (i = 0; i < sel->info.num_inputs; i++) {
 			unsigned name = sel->info.input_semantic_name[i];
@@ -501,10 +688,31 @@ static void *si_create_shader_state(struct pipe_context *ctx,
 			case TGSI_SEMANTIC_PRIMID:
 				break;
 			default:
-				sel->gs_used_inputs |=
+				sel->inputs_read |=
 					1llu << si_shader_io_get_unique_index(name, index);
 			}
 		}
+		break;
+
+	case PIPE_SHADER_VERTEX:
+	case PIPE_SHADER_TESS_CTRL:
+		for (i = 0; i < sel->info.num_outputs; i++) {
+			unsigned name = sel->info.output_semantic_name[i];
+			unsigned index = sel->info.output_semantic_index[i];
+
+			switch (name) {
+			case TGSI_SEMANTIC_TESSINNER:
+			case TGSI_SEMANTIC_TESSOUTER:
+			case TGSI_SEMANTIC_PATCH:
+				sel->patch_outputs_written |=
+					1llu << si_shader_io_get_unique_index(name, index);
+				break;
+			default:
+				sel->outputs_written |=
+					1llu << si_shader_io_get_unique_index(name, index);
+			}
+		}
+		break;
 	}
 
 	if (sscreen->b.debug_flags & DBG_PRECOMPILE)
@@ -531,6 +739,18 @@ static void *si_create_vs_state(struct pipe_context *ctx,
 	return si_create_shader_state(ctx, state, PIPE_SHADER_VERTEX);
 }
 
+static void *si_create_tcs_state(struct pipe_context *ctx,
+				 const struct pipe_shader_state *state)
+{
+	return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_CTRL);
+}
+
+static void *si_create_tes_state(struct pipe_context *ctx,
+				 const struct pipe_shader_state *state)
+{
+	return si_create_shader_state(ctx, state, PIPE_SHADER_TESS_EVAL);
+}
+
 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
@@ -540,20 +760,58 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 		return;
 
 	sctx->vs_shader = sel;
-	sctx->clip_regs.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 }
 
 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
+	bool enable_changed = !!sctx->gs_shader != !!sel;
 
 	if (sctx->gs_shader == sel)
 		return;
 
 	sctx->gs_shader = sel;
-	sctx->clip_regs.dirty = true;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+
+	if (enable_changed)
+		si_shader_change_notify(sctx);
+}
+
+static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = state;
+	bool enable_changed = !!sctx->tcs_shader != !!sel;
+
+	if (sctx->tcs_shader == sel)
+		return;
+
+	sctx->tcs_shader = sel;
+
+	if (enable_changed)
+		sctx->last_tcs = NULL; /* invalidate derived tess state */
+}
+
+static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = state;
+	bool enable_changed = !!sctx->tes_shader != !!sel;
+
+	if (sctx->tes_shader == sel)
+		return;
+
+	sctx->tes_shader = sel;
+	si_mark_atom_dirty(sctx, &sctx->clip_regs);
+	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+
+	if (enable_changed) {
+		si_shader_change_notify(sctx);
+		sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
+	}
 }
 
 static void si_make_dummy_ps(struct si_context *sctx)
@@ -594,7 +852,18 @@ static void si_delete_shader_selector(struct pipe_context *ctx,
 		c = p->next_variant;
 		switch (sel->type) {
 		case PIPE_SHADER_VERTEX:
-			if (p->key.vs.as_es)
+			if (p->key.vs.as_ls)
+				si_pm4_delete_state(sctx, ls, p->pm4);
+			else if (p->key.vs.as_es)
+				si_pm4_delete_state(sctx, es, p->pm4);
+			else
+				si_pm4_delete_state(sctx, vs, p->pm4);
+			break;
+		case PIPE_SHADER_TESS_CTRL:
+			si_pm4_delete_state(sctx, hs, p->pm4);
+			break;
+		case PIPE_SHADER_TESS_EVAL:
+			if (p->key.tes.as_es)
 				si_pm4_delete_state(sctx, es, p->pm4);
 			else
 				si_pm4_delete_state(sctx, vs, p->pm4);
@@ -653,6 +922,30 @@ static void si_delete_ps_shader(struct pipe_context *ctx, void *state)
 	si_delete_shader_selector(ctx, sel);
 }
 
+static void si_delete_tcs_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+	if (sctx->tcs_shader == sel) {
+		sctx->tcs_shader = NULL;
+	}
+
+	si_delete_shader_selector(ctx, sel);
+}
+
+static void si_delete_tes_shader(struct pipe_context *ctx, void *state)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_shader_selector *sel = (struct si_shader_selector *)state;
+
+	if (sctx->tes_shader == sel) {
+		sctx->tes_shader = NULL;
+	}
+
+	si_delete_shader_selector(ctx, sel);
+}
+
 static void si_update_spi_map(struct si_context *sctx)
 {
 	struct si_shader *ps = sctx->ps_shader->current;
@@ -694,7 +987,10 @@ bcolor:
 			}
 		}
 
-		if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) {
+		if (name == TGSI_SEMANTIC_PRIMID)
+			/* PrimID is written after the last output. */
+			tmp |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]);
+		else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) {
 			/* No corresponding output found, load defaults into input.
 			 * Don't set any other bits.
 			 * (FLAT_SHADE=1 completely changes behavior) */
@@ -720,7 +1016,7 @@ bcolor:
 static void si_init_gs_rings(struct si_context *sctx)
 {
 	unsigned esgs_ring_size = 128 * 1024;
-	unsigned gsvs_ring_size = 64 * 1024 * 1024;
+	unsigned gsvs_ring_size = 60 * 1024 * 1024;
 
 	assert(!sctx->gs_rings);
 	sctx->gs_rings = CALLOC_STRUCT(si_pm4_state);
@@ -732,6 +1028,12 @@ static void si_init_gs_rings(struct si_context *sctx)
 					     PIPE_USAGE_DEFAULT, gsvs_ring_size);
 
 	if (sctx->b.chip_class >= CIK) {
+		if (sctx->b.chip_class >= VI) {
+			/* The maximum sizes are 63.999 MB on VI, because
+			 * the register fields only have 18 bits. */
+			assert(esgs_ring_size / 256 < (1 << 18));
+			assert(gsvs_ring_size / 256 < (1 << 18));
+		}
 		si_pm4_set_reg(sctx->gs_rings, R_030900_VGT_ESGS_RING_SIZE,
 			       esgs_ring_size / 256);
 		si_pm4_set_reg(sctx->gs_rings, R_030904_VGT_GSVS_RING_SIZE,
@@ -745,15 +1047,42 @@ static void si_init_gs_rings(struct si_context *sctx)
 
 	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
 			   sctx->esgs_ring, 0, esgs_ring_size,
-			   true, true, 4, 64);
+			   true, true, 4, 64, 0);
 	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
 			   sctx->esgs_ring, 0, esgs_ring_size,
-			   false, false, 0, 0);
+			   false, false, 0, 0, 0);
 	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
 			   sctx->gsvs_ring, 0, gsvs_ring_size,
-			   false, false, 0, 0);
+			   false, false, 0, 0, 0);
 }
 
+static void si_update_gs_rings(struct si_context *sctx)
+{
+	unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16;
+	unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices;
+	unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
+	uint64_t offset;
+
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, 0);
+
+	offset = gsvs_itemsize * 64;
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, offset);
+
+	offset = (gsvs_itemsize * 2) * 64;
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, offset);
+
+	offset = (gsvs_itemsize * 3) * 64;
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3,
+			   sctx->gsvs_ring, gsvs_itemsize,
+			   64, true, true, 4, 16, offset);
+
+}
 /**
  * @returns 1 if \p sel has been updated to use a new scratch buffer and 0
  *          otherwise.
@@ -763,7 +1092,6 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx,
 {
 	struct si_shader *shader;
 	uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
-	unsigned char *ptr;
 
 	if (!sel)
 		return 0;
@@ -784,12 +1112,7 @@ static unsigned si_update_scratch_buffer(struct si_context *sctx,
 	si_shader_apply_scratch_relocs(sctx, shader, scratch_va);
 
 	/* Replace the shader bo with a new bo that has the relocs applied. */
-	r600_resource_reference(&shader->bo, NULL);
-	shader->bo = si_resource_create_custom(&sctx->screen->b.b, PIPE_USAGE_IMMUTABLE,
-					       shader->binary.code_size);
-	ptr = sctx->screen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE);
-	util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size);
-	sctx->screen->b.ws->buffer_unmap(shader->bo->cs_buf);
+	si_shader_binary_upload(sctx->screen, shader);
 
 	/* Update the shader state to use the new shader bo. */
 	si_shader_init_pm4_state(shader);
@@ -818,10 +1141,14 @@ static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx,
 
 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
 {
+	unsigned bytes = 0;
 
-	return MAX3(si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader),
-			si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader),
-			si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tcs_shader));
+	bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx, sctx->tes_shader));
+	return bytes;
 }
 
 static void si_update_spi_tmpring_size(struct si_context *sctx)
@@ -855,15 +1182,29 @@ static void si_update_spi_tmpring_size(struct si_context *sctx)
 			si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
 		if (si_update_scratch_buffer(sctx, sctx->gs_shader))
 			si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
+		if (si_update_scratch_buffer(sctx, sctx->tcs_shader))
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
 
-		/* VS can be bound as ES or VS. */
-		if (sctx->gs_shader) {
+		/* VS can be bound as LS, ES, or VS. */
+		if (sctx->tes_shader) {
+			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+				si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
+		} else if (sctx->gs_shader) {
 			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
 				si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
 		} else {
 			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
 				si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
 		}
+
+		/* TES can be bound as ES or VS. */
+		if (sctx->gs_shader) {
+			if (si_update_scratch_buffer(sctx, sctx->tes_shader))
+				si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+		} else {
+			if (si_update_scratch_buffer(sctx, sctx->tes_shader))
+				si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
+		}
 	}
 
 	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
@@ -874,60 +1215,187 @@ static void si_update_spi_tmpring_size(struct si_context *sctx)
 				S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
 }
 
+static void si_init_tess_factor_ring(struct si_context *sctx)
+{
+	assert(!sctx->tf_state);
+	sctx->tf_state = CALLOC_STRUCT(si_pm4_state);
+
+	sctx->tf_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+					   PIPE_USAGE_DEFAULT,
+					   32768 * sctx->screen->b.info.max_se);
+	sctx->b.clear_buffer(&sctx->b.b, sctx->tf_ring, 0,
+			     sctx->tf_ring->width0, fui(0), false);
+	assert(((sctx->tf_ring->width0 / 4) & C_030938_SIZE) == 0);
+
+	if (sctx->b.chip_class >= CIK) {
+		si_pm4_set_reg(sctx->tf_state, R_030938_VGT_TF_RING_SIZE,
+			       S_030938_SIZE(sctx->tf_ring->width0 / 4));
+		si_pm4_set_reg(sctx->tf_state, R_030940_VGT_TF_MEMORY_BASE,
+			       r600_resource(sctx->tf_ring)->gpu_address >> 8);
+	} else {
+		si_pm4_set_reg(sctx->tf_state, R_008988_VGT_TF_RING_SIZE,
+			       S_008988_SIZE(sctx->tf_ring->width0 / 4));
+		si_pm4_set_reg(sctx->tf_state, R_0089B8_VGT_TF_MEMORY_BASE,
+			       r600_resource(sctx->tf_ring)->gpu_address >> 8);
+	}
+	si_pm4_add_bo(sctx->tf_state, r600_resource(sctx->tf_ring),
+		      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
+	si_pm4_bind_state(sctx, tf_ring, sctx->tf_state);
+
+	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL,
+			   SI_RING_TESS_FACTOR, sctx->tf_ring, 0,
+			   sctx->tf_ring->width0, false, false, 0, 0, 0);
+
+	sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
+}
+
+/**
+ * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
+ * VS passes its outputs to TES directly, so the fixed-function shader only
+ * has to write TESSOUTER and TESSINNER.
+ */
+static void si_generate_fixed_func_tcs(struct si_context *sctx)
+{
+	struct ureg_src const0, const1;
+	struct ureg_dst tessouter, tessinner;
+	struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL);
+
+	if (!ureg)
+		return; /* ureg_create failed; fixed_func_tcs_shader stays NULL */
+
+	assert(!sctx->fixed_func_tcs_shader);
+
+	ureg_DECL_constant2D(ureg, 0, 1, SI_DRIVER_STATE_CONST_BUF);
+	const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0),
+				    SI_DRIVER_STATE_CONST_BUF);
+	const1 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 1),
+				    SI_DRIVER_STATE_CONST_BUF);
+
+	tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
+	tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
+
+	ureg_MOV(ureg, tessouter, const0);
+	ureg_MOV(ureg, tessinner, const1);
+	ureg_END(ureg);
+
+	sctx->fixed_func_tcs_shader =
+		ureg_create_shader_and_destroy(ureg, &sctx->b.b);
+	assert(sctx->fixed_func_tcs_shader);
+}
+
+static void si_update_vgt_shader_config(struct si_context *sctx)
+{
+	/* Calculate the index of the config.
+	 * 0 = VS, 1 = VS+GS, 2 = VS+Tess, 3 = VS+Tess+GS */
+	unsigned index = 2*!!sctx->tes_shader + !!sctx->gs_shader;
+	struct si_pm4_state **pm4 = &sctx->vgt_shader_config[index];
+
+	if (!*pm4) {
+		uint32_t stages = 0;
+
+		*pm4 = CALLOC_STRUCT(si_pm4_state);
+
+		if (sctx->tes_shader) {
+			stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
+				  S_028B54_HS_EN(1);
+
+			if (sctx->gs_shader)
+				stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
+					  S_028B54_GS_EN(1) |
+				          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+			else
+				stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
+		} else if (sctx->gs_shader) {
+			stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
+				  S_028B54_GS_EN(1) |
+			          S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
+		}
+
+		si_pm4_set_reg(*pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
+	}
+	si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
+}
+
+static void si_update_so(struct si_context *sctx, struct si_shader_selector *shader)
+{
+	struct pipe_stream_output_info *so = &shader->so;
+	uint32_t enabled_stream_buffers_mask = 0;
+	int i;
+
+	for (i = 0; i < so->num_outputs; i++)
+		enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4);
+	sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask;
+	sctx->b.streamout.stride_in_dw = shader->so.stride;
+}
+
 void si_update_shaders(struct si_context *sctx)
 {
 	struct pipe_context *ctx = (struct pipe_context*)sctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
+	/* Update stages before GS. */
+	if (sctx->tes_shader) {
+		if (!sctx->tf_state)
+			si_init_tess_factor_ring(sctx);
+
+		/* VS as LS */
+		si_shader_select(ctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, ls, sctx->vs_shader->current->pm4);
+
+		if (sctx->tcs_shader) {
+			si_shader_select(ctx, sctx->tcs_shader);
+			si_pm4_bind_state(sctx, hs, sctx->tcs_shader->current->pm4);
+		} else {
+			if (!sctx->fixed_func_tcs_shader)
+				si_generate_fixed_func_tcs(sctx);
+			si_shader_select(ctx, sctx->fixed_func_tcs_shader);
+			si_pm4_bind_state(sctx, hs,
+					  sctx->fixed_func_tcs_shader->current->pm4);
+		}
+
+		si_shader_select(ctx, sctx->tes_shader);
+		if (sctx->gs_shader) {
+			/* TES as ES */
+			si_pm4_bind_state(sctx, es, sctx->tes_shader->current->pm4);
+		} else {
+			/* TES as VS */
+			si_pm4_bind_state(sctx, vs, sctx->tes_shader->current->pm4);
+			si_update_so(sctx, sctx->tes_shader);
+		}
+	} else if (sctx->gs_shader) {
+		/* VS as ES */
+		si_shader_select(ctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+	} else {
+		/* VS as VS */
+		si_shader_select(ctx, sctx->vs_shader);
+		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+		si_update_so(sctx, sctx->vs_shader);
+	}
+
+	/* Update GS. */
 	if (sctx->gs_shader) {
 		si_shader_select(ctx, sctx->gs_shader);
 		si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
 		si_pm4_bind_state(sctx, vs, sctx->gs_shader->current->gs_copy_shader->pm4);
-
-		sctx->b.streamout.stride_in_dw = sctx->gs_shader->so.stride;
-
-		si_shader_select(ctx, sctx->vs_shader);
-		si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+		si_update_so(sctx, sctx->gs_shader);
 
 		if (!sctx->gs_rings)
 			si_init_gs_rings(sctx);
+
 		if (sctx->emitted.named.gs_rings != sctx->gs_rings)
 			sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 		si_pm4_bind_state(sctx, gs_rings, sctx->gs_rings);
 
-		si_set_ring_buffer(ctx, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
-				   sctx->gsvs_ring,
-				   sctx->gs_shader->gs_max_out_vertices *
-				   sctx->gs_shader->info.num_outputs * 16,
-				   64, true, true, 4, 16);
-
-		if (!sctx->gs_on) {
-			sctx->gs_on = CALLOC_STRUCT(si_pm4_state);
-
-			si_pm4_set_reg(sctx->gs_on, R_028B54_VGT_SHADER_STAGES_EN,
-				       S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
-				       S_028B54_GS_EN(1) |
-				       S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER));
-		}
-		si_pm4_bind_state(sctx, gs_onoff, sctx->gs_on);
+		si_update_gs_rings(sctx);
 	} else {
-		si_shader_select(ctx, sctx->vs_shader);
-		si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
-
-		sctx->b.streamout.stride_in_dw = sctx->vs_shader->so.stride;
-
-		if (!sctx->gs_off) {
-			sctx->gs_off = CALLOC_STRUCT(si_pm4_state);
-
-			si_pm4_set_reg(sctx->gs_off, R_028A40_VGT_GS_MODE, 0);
-			si_pm4_set_reg(sctx->gs_off, R_028B54_VGT_SHADER_STAGES_EN, 0);
-		}
-		si_pm4_bind_state(sctx, gs_onoff, sctx->gs_off);
 		si_pm4_bind_state(sctx, gs_rings, NULL);
 		si_pm4_bind_state(sctx, gs, NULL);
 		si_pm4_bind_state(sctx, es, NULL);
 	}
 
+	si_update_vgt_shader_config(sctx);
+
 	si_shader_select(ctx, sctx->ps_shader);
 
 	if (!sctx->ps_shader->current) {
@@ -957,29 +1425,35 @@ void si_update_shaders(struct si_context *sctx)
 
 	if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) {
 		sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control;
-		sctx->db_render_state.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 
 	if (sctx->smoothing_enabled != sctx->ps_shader->current->key.ps.poly_line_smoothing) {
 		sctx->smoothing_enabled = sctx->ps_shader->current->key.ps.poly_line_smoothing;
-		sctx->msaa_config.dirty = true;
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
 		if (sctx->b.chip_class == SI)
-			sctx->db_render_state.dirty = true;
+			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
 }
 
 void si_init_shader_functions(struct si_context *sctx)
 {
 	sctx->b.b.create_vs_state = si_create_vs_state;
+	sctx->b.b.create_tcs_state = si_create_tcs_state;
+	sctx->b.b.create_tes_state = si_create_tes_state;
 	sctx->b.b.create_gs_state = si_create_gs_state;
 	sctx->b.b.create_fs_state = si_create_fs_state;
 
 	sctx->b.b.bind_vs_state = si_bind_vs_shader;
+	sctx->b.b.bind_tcs_state = si_bind_tcs_shader;
+	sctx->b.b.bind_tes_state = si_bind_tes_shader;
 	sctx->b.b.bind_gs_state = si_bind_gs_shader;
 	sctx->b.b.bind_fs_state = si_bind_ps_shader;
 
 	sctx->b.b.delete_vs_state = si_delete_vs_shader;
+	sctx->b.b.delete_tcs_state = si_delete_tcs_shader;
+	sctx->b.b.delete_tes_state = si_delete_tes_shader;
 	sctx->b.b.delete_gs_state = si_delete_gs_shader;
 	sctx->b.b.delete_fs_state = si_delete_ps_shader;
 }
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 35d5ee232a0..66fdf35c8af 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -206,6 +206,398 @@
  * 6. COMMAND [29:22] | BYTE_COUNT [20:0]
  */
 
+
+#define R_000E4C_SRBM_STATUS2                                           0x000E4C
+#define   S_000E4C_SDMA_RQ_PENDING(x)                                 (((x) & 0x1) << 0)
+#define   G_000E4C_SDMA_RQ_PENDING(x)                                 (((x) >> 0) & 0x1)
+#define   C_000E4C_SDMA_RQ_PENDING                                    0xFFFFFFFE
+#define   S_000E4C_TST_RQ_PENDING(x)                                  (((x) & 0x1) << 1)
+#define   G_000E4C_TST_RQ_PENDING(x)                                  (((x) >> 1) & 0x1)
+#define   C_000E4C_TST_RQ_PENDING                                     0xFFFFFFFD
+#define   S_000E4C_SDMA1_RQ_PENDING(x)                                (((x) & 0x1) << 2)
+#define   G_000E4C_SDMA1_RQ_PENDING(x)                                (((x) >> 2) & 0x1)
+#define   C_000E4C_SDMA1_RQ_PENDING                                   0xFFFFFFFB
+#define   S_000E4C_VCE0_RQ_PENDING(x)                                 (((x) & 0x1) << 3)
+#define   G_000E4C_VCE0_RQ_PENDING(x)                                 (((x) >> 3) & 0x1)
+#define   C_000E4C_VCE0_RQ_PENDING                                    0xFFFFFFF7
+#define   S_000E4C_VP8_BUSY(x)                                        (((x) & 0x1) << 4)
+#define   G_000E4C_VP8_BUSY(x)                                        (((x) >> 4) & 0x1)
+#define   C_000E4C_VP8_BUSY                                           0xFFFFFFEF
+#define   S_000E4C_SDMA_BUSY(x)                                       (((x) & 0x1) << 5)
+#define   G_000E4C_SDMA_BUSY(x)                                       (((x) >> 5) & 0x1)
+#define   C_000E4C_SDMA_BUSY                                          0xFFFFFFDF
+#define   S_000E4C_SDMA1_BUSY(x)                                      (((x) & 0x1) << 6)
+#define   G_000E4C_SDMA1_BUSY(x)                                      (((x) >> 6) & 0x1)
+#define   C_000E4C_SDMA1_BUSY                                         0xFFFFFFBF
+#define   S_000E4C_VCE0_BUSY(x)                                       (((x) & 0x1) << 7)
+#define   G_000E4C_VCE0_BUSY(x)                                       (((x) >> 7) & 0x1)
+#define   C_000E4C_VCE0_BUSY                                          0xFFFFFF7F
+#define   S_000E4C_XDMA_BUSY(x)                                       (((x) & 0x1) << 8)
+#define   G_000E4C_XDMA_BUSY(x)                                       (((x) >> 8) & 0x1)
+#define   C_000E4C_XDMA_BUSY                                          0xFFFFFEFF
+#define   S_000E4C_CHUB_BUSY(x)                                       (((x) & 0x1) << 9)
+#define   G_000E4C_CHUB_BUSY(x)                                       (((x) >> 9) & 0x1)
+#define   C_000E4C_CHUB_BUSY                                          0xFFFFFDFF
+#define   S_000E4C_SDMA2_BUSY(x)                                      (((x) & 0x1) << 10)
+#define   G_000E4C_SDMA2_BUSY(x)                                      (((x) >> 10) & 0x1)
+#define   C_000E4C_SDMA2_BUSY                                         0xFFFFFBFF
+#define   S_000E4C_SDMA3_BUSY(x)                                      (((x) & 0x1) << 11)
+#define   G_000E4C_SDMA3_BUSY(x)                                      (((x) >> 11) & 0x1)
+#define   C_000E4C_SDMA3_BUSY                                         0xFFFFF7FF
+#define   S_000E4C_SAMSCP_BUSY(x)                                     (((x) & 0x1) << 12)
+#define   G_000E4C_SAMSCP_BUSY(x)                                     (((x) >> 12) & 0x1)
+#define   C_000E4C_SAMSCP_BUSY                                        0xFFFFEFFF
+#define   S_000E4C_ISP_BUSY(x)                                        (((x) & 0x1) << 13)
+#define   G_000E4C_ISP_BUSY(x)                                        (((x) >> 13) & 0x1)
+#define   C_000E4C_ISP_BUSY                                           0xFFFFDFFF
+#define   S_000E4C_VCE1_BUSY(x)                                       (((x) & 0x1) << 14)
+#define   G_000E4C_VCE1_BUSY(x)                                       (((x) >> 14) & 0x1)
+#define   C_000E4C_VCE1_BUSY                                          0xFFFFBFFF
+#define   S_000E4C_ODE_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_000E4C_ODE_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_000E4C_ODE_BUSY                                           0xFFFF7FFF
+#define   S_000E4C_SDMA2_RQ_PENDING(x)                                (((x) & 0x1) << 16)
+#define   G_000E4C_SDMA2_RQ_PENDING(x)                                (((x) >> 16) & 0x1)
+#define   C_000E4C_SDMA2_RQ_PENDING                                   0xFFFEFFFF
+#define   S_000E4C_SDMA3_RQ_PENDING(x)                                (((x) & 0x1) << 17)
+#define   G_000E4C_SDMA3_RQ_PENDING(x)                                (((x) >> 17) & 0x1)
+#define   C_000E4C_SDMA3_RQ_PENDING                                   0xFFFDFFFF
+#define   S_000E4C_SAMSCP_RQ_PENDING(x)                               (((x) & 0x1) << 18)
+#define   G_000E4C_SAMSCP_RQ_PENDING(x)                               (((x) >> 18) & 0x1)
+#define   C_000E4C_SAMSCP_RQ_PENDING                                  0xFFFBFFFF
+#define   S_000E4C_ISP_RQ_PENDING(x)                                  (((x) & 0x1) << 19)
+#define   G_000E4C_ISP_RQ_PENDING(x)                                  (((x) >> 19) & 0x1)
+#define   C_000E4C_ISP_RQ_PENDING                                     0xFFF7FFFF
+#define   S_000E4C_VCE1_RQ_PENDING(x)                                 (((x) & 0x1) << 20)
+#define   G_000E4C_VCE1_RQ_PENDING(x)                                 (((x) >> 20) & 0x1)
+#define   C_000E4C_VCE1_RQ_PENDING                                    0xFFEFFFFF
+#define R_000E50_SRBM_STATUS                                            0x000E50
+#define   S_000E50_UVD_RQ_PENDING(x)                                  (((x) & 0x1) << 1)
+#define   G_000E50_UVD_RQ_PENDING(x)                                  (((x) >> 1) & 0x1)
+#define   C_000E50_UVD_RQ_PENDING                                     0xFFFFFFFD
+#define   S_000E50_SAMMSP_RQ_PENDING(x)                               (((x) & 0x1) << 2)
+#define   G_000E50_SAMMSP_RQ_PENDING(x)                               (((x) >> 2) & 0x1)
+#define   C_000E50_SAMMSP_RQ_PENDING                                  0xFFFFFFFB
+#define   S_000E50_ACP_RQ_PENDING(x)                                  (((x) & 0x1) << 3)
+#define   G_000E50_ACP_RQ_PENDING(x)                                  (((x) >> 3) & 0x1)
+#define   C_000E50_ACP_RQ_PENDING                                     0xFFFFFFF7
+#define   S_000E50_SMU_RQ_PENDING(x)                                  (((x) & 0x1) << 4)
+#define   G_000E50_SMU_RQ_PENDING(x)                                  (((x) >> 4) & 0x1)
+#define   C_000E50_SMU_RQ_PENDING                                     0xFFFFFFEF
+#define   S_000E50_GRBM_RQ_PENDING(x)                                 (((x) & 0x1) << 5)
+#define   G_000E50_GRBM_RQ_PENDING(x)                                 (((x) >> 5) & 0x1)
+#define   C_000E50_GRBM_RQ_PENDING                                    0xFFFFFFDF
+#define   S_000E50_HI_RQ_PENDING(x)                                   (((x) & 0x1) << 6)
+#define   G_000E50_HI_RQ_PENDING(x)                                   (((x) >> 6) & 0x1)
+#define   C_000E50_HI_RQ_PENDING                                      0xFFFFFFBF
+#define   S_000E50_VMC_BUSY(x)                                        (((x) & 0x1) << 8)
+#define   G_000E50_VMC_BUSY(x)                                        (((x) >> 8) & 0x1)
+#define   C_000E50_VMC_BUSY                                           0xFFFFFEFF
+#define   S_000E50_MCB_BUSY(x)                                        (((x) & 0x1) << 9)
+#define   G_000E50_MCB_BUSY(x)                                        (((x) >> 9) & 0x1)
+#define   C_000E50_MCB_BUSY                                           0xFFFFFDFF
+#define   S_000E50_MCB_NON_DISPLAY_BUSY(x)                            (((x) & 0x1) << 10)
+#define   G_000E50_MCB_NON_DISPLAY_BUSY(x)                            (((x) >> 10) & 0x1)
+#define   C_000E50_MCB_NON_DISPLAY_BUSY                               0xFFFFFBFF
+#define   S_000E50_MCC_BUSY(x)                                        (((x) & 0x1) << 11)
+#define   G_000E50_MCC_BUSY(x)                                        (((x) >> 11) & 0x1)
+#define   C_000E50_MCC_BUSY                                           0xFFFFF7FF
+#define   S_000E50_MCD_BUSY(x)                                        (((x) & 0x1) << 12)
+#define   G_000E50_MCD_BUSY(x)                                        (((x) >> 12) & 0x1)
+#define   C_000E50_MCD_BUSY                                           0xFFFFEFFF
+#define   S_000E50_VMC1_BUSY(x)                                       (((x) & 0x1) << 13)
+#define   G_000E50_VMC1_BUSY(x)                                       (((x) >> 13) & 0x1)
+#define   C_000E50_VMC1_BUSY                                          0xFFFFDFFF
+#define   S_000E50_SEM_BUSY(x)                                        (((x) & 0x1) << 14)
+#define   G_000E50_SEM_BUSY(x)                                        (((x) >> 14) & 0x1)
+#define   C_000E50_SEM_BUSY                                           0xFFFFBFFF
+#define   S_000E50_ACP_BUSY(x)                                        (((x) & 0x1) << 16)
+#define   G_000E50_ACP_BUSY(x)                                        (((x) >> 16) & 0x1)
+#define   C_000E50_ACP_BUSY                                           0xFFFEFFFF
+#define   S_000E50_IH_BUSY(x)                                         (((x) & 0x1) << 17)
+#define   G_000E50_IH_BUSY(x)                                         (((x) >> 17) & 0x1)
+#define   C_000E50_IH_BUSY                                            0xFFFDFFFF
+#define   S_000E50_UVD_BUSY(x)                                        (((x) & 0x1) << 19)
+#define   G_000E50_UVD_BUSY(x)                                        (((x) >> 19) & 0x1)
+#define   C_000E50_UVD_BUSY                                           0xFFF7FFFF
+#define   S_000E50_SAMMSP_BUSY(x)                                     (((x) & 0x1) << 20)
+#define   G_000E50_SAMMSP_BUSY(x)                                     (((x) >> 20) & 0x1)
+#define   C_000E50_SAMMSP_BUSY                                        0xFFEFFFFF
+#define   S_000E50_GCATCL2_BUSY(x)                                    (((x) & 0x1) << 21)
+#define   G_000E50_GCATCL2_BUSY(x)                                    (((x) >> 21) & 0x1)
+#define   C_000E50_GCATCL2_BUSY                                       0xFFDFFFFF
+#define   S_000E50_OSATCL2_BUSY(x)                                    (((x) & 0x1) << 22)
+#define   G_000E50_OSATCL2_BUSY(x)                                    (((x) >> 22) & 0x1)
+#define   C_000E50_OSATCL2_BUSY                                       0xFFBFFFFF
+#define   S_000E50_BIF_BUSY(x)                                        (((x) & 0x1) << 29)
+#define   G_000E50_BIF_BUSY(x)                                        (((x) >> 29) & 0x1)
+#define   C_000E50_BIF_BUSY                                           0xDFFFFFFF
+#define R_000E54_SRBM_STATUS3                                           0x000E54
+#define   S_000E54_MCC0_BUSY(x)                                       (((x) & 0x1) << 0)
+#define   G_000E54_MCC0_BUSY(x)                                       (((x) >> 0) & 0x1)
+#define   C_000E54_MCC0_BUSY                                          0xFFFFFFFE
+#define   S_000E54_MCC1_BUSY(x)                                       (((x) & 0x1) << 1)
+#define   G_000E54_MCC1_BUSY(x)                                       (((x) >> 1) & 0x1)
+#define   C_000E54_MCC1_BUSY                                          0xFFFFFFFD
+#define   S_000E54_MCC2_BUSY(x)                                       (((x) & 0x1) << 2)
+#define   G_000E54_MCC2_BUSY(x)                                       (((x) >> 2) & 0x1)
+#define   C_000E54_MCC2_BUSY                                          0xFFFFFFFB
+#define   S_000E54_MCC3_BUSY(x)                                       (((x) & 0x1) << 3)
+#define   G_000E54_MCC3_BUSY(x)                                       (((x) >> 3) & 0x1)
+#define   C_000E54_MCC3_BUSY                                          0xFFFFFFF7
+#define   S_000E54_MCC4_BUSY(x)                                       (((x) & 0x1) << 4)
+#define   G_000E54_MCC4_BUSY(x)                                       (((x) >> 4) & 0x1)
+#define   C_000E54_MCC4_BUSY                                          0xFFFFFFEF
+#define   S_000E54_MCC5_BUSY(x)                                       (((x) & 0x1) << 5)
+#define   G_000E54_MCC5_BUSY(x)                                       (((x) >> 5) & 0x1)
+#define   C_000E54_MCC5_BUSY                                          0xFFFFFFDF
+#define   S_000E54_MCC6_BUSY(x)                                       (((x) & 0x1) << 6)
+#define   G_000E54_MCC6_BUSY(x)                                       (((x) >> 6) & 0x1)
+#define   C_000E54_MCC6_BUSY                                          0xFFFFFFBF
+#define   S_000E54_MCC7_BUSY(x)                                       (((x) & 0x1) << 7)
+#define   G_000E54_MCC7_BUSY(x)                                       (((x) >> 7) & 0x1)
+#define   C_000E54_MCC7_BUSY                                          0xFFFFFF7F
+#define   S_000E54_MCD0_BUSY(x)                                       (((x) & 0x1) << 8)
+#define   G_000E54_MCD0_BUSY(x)                                       (((x) >> 8) & 0x1)
+#define   C_000E54_MCD0_BUSY                                          0xFFFFFEFF
+#define   S_000E54_MCD1_BUSY(x)                                       (((x) & 0x1) << 9)
+#define   G_000E54_MCD1_BUSY(x)                                       (((x) >> 9) & 0x1)
+#define   C_000E54_MCD1_BUSY                                          0xFFFFFDFF
+#define   S_000E54_MCD2_BUSY(x)                                       (((x) & 0x1) << 10)
+#define   G_000E54_MCD2_BUSY(x)                                       (((x) >> 10) & 0x1)
+#define   C_000E54_MCD2_BUSY                                          0xFFFFFBFF
+#define   S_000E54_MCD3_BUSY(x)                                       (((x) & 0x1) << 11)
+#define   G_000E54_MCD3_BUSY(x)                                       (((x) >> 11) & 0x1)
+#define   C_000E54_MCD3_BUSY                                          0xFFFFF7FF
+#define   S_000E54_MCD4_BUSY(x)                                       (((x) & 0x1) << 12)
+#define   G_000E54_MCD4_BUSY(x)                                       (((x) >> 12) & 0x1)
+#define   C_000E54_MCD4_BUSY                                          0xFFFFEFFF
+#define   S_000E54_MCD5_BUSY(x)                                       (((x) & 0x1) << 13)
+#define   G_000E54_MCD5_BUSY(x)                                       (((x) >> 13) & 0x1)
+#define   C_000E54_MCD5_BUSY                                          0xFFFFDFFF
+#define   S_000E54_MCD6_BUSY(x)                                       (((x) & 0x1) << 14)
+#define   G_000E54_MCD6_BUSY(x)                                       (((x) >> 14) & 0x1)
+#define   C_000E54_MCD6_BUSY                                          0xFFFFBFFF
+#define   S_000E54_MCD7_BUSY(x)                                       (((x) & 0x1) << 15)
+#define   G_000E54_MCD7_BUSY(x)                                       (((x) >> 15) & 0x1)
+#define   C_000E54_MCD7_BUSY                                          0xFFFF7FFF
+#define R_00D034_SDMA0_STATUS_REG                                       0x00D034
+#define   S_00D034_IDLE(x)                                            (((x) & 0x1) << 0)
+#define   G_00D034_IDLE(x)                                            (((x) >> 0) & 0x1)
+#define   C_00D034_IDLE                                               0xFFFFFFFE
+#define   S_00D034_REG_IDLE(x)                                        (((x) & 0x1) << 1)
+#define   G_00D034_REG_IDLE(x)                                        (((x) >> 1) & 0x1)
+#define   C_00D034_REG_IDLE                                           0xFFFFFFFD
+#define   S_00D034_RB_EMPTY(x)                                        (((x) & 0x1) << 2)
+#define   G_00D034_RB_EMPTY(x)                                        (((x) >> 2) & 0x1)
+#define   C_00D034_RB_EMPTY                                           0xFFFFFFFB
+#define   S_00D034_RB_FULL(x)                                         (((x) & 0x1) << 3)
+#define   G_00D034_RB_FULL(x)                                         (((x) >> 3) & 0x1)
+#define   C_00D034_RB_FULL                                            0xFFFFFFF7
+#define   S_00D034_RB_CMD_IDLE(x)                                     (((x) & 0x1) << 4)
+#define   G_00D034_RB_CMD_IDLE(x)                                     (((x) >> 4) & 0x1)
+#define   C_00D034_RB_CMD_IDLE                                        0xFFFFFFEF
+#define   S_00D034_RB_CMD_FULL(x)                                     (((x) & 0x1) << 5)
+#define   G_00D034_RB_CMD_FULL(x)                                     (((x) >> 5) & 0x1)
+#define   C_00D034_RB_CMD_FULL                                        0xFFFFFFDF
+#define   S_00D034_IB_CMD_IDLE(x)                                     (((x) & 0x1) << 6)
+#define   G_00D034_IB_CMD_IDLE(x)                                     (((x) >> 6) & 0x1)
+#define   C_00D034_IB_CMD_IDLE                                        0xFFFFFFBF
+#define   S_00D034_IB_CMD_FULL(x)                                     (((x) & 0x1) << 7)
+#define   G_00D034_IB_CMD_FULL(x)                                     (((x) >> 7) & 0x1)
+#define   C_00D034_IB_CMD_FULL                                        0xFFFFFF7F
+#define   S_00D034_BLOCK_IDLE(x)                                      (((x) & 0x1) << 8)
+#define   G_00D034_BLOCK_IDLE(x)                                      (((x) >> 8) & 0x1)
+#define   C_00D034_BLOCK_IDLE                                         0xFFFFFEFF
+#define   S_00D034_INSIDE_IB(x)                                       (((x) & 0x1) << 9)
+#define   G_00D034_INSIDE_IB(x)                                       (((x) >> 9) & 0x1)
+#define   C_00D034_INSIDE_IB                                          0xFFFFFDFF
+#define   S_00D034_EX_IDLE(x)                                         (((x) & 0x1) << 10)
+#define   G_00D034_EX_IDLE(x)                                         (((x) >> 10) & 0x1)
+#define   C_00D034_EX_IDLE                                            0xFFFFFBFF
+#define   S_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x)                       (((x) & 0x1) << 11)
+#define   G_00D034_EX_IDLE_POLL_TIMER_EXPIRE(x)                       (((x) >> 11) & 0x1)
+#define   C_00D034_EX_IDLE_POLL_TIMER_EXPIRE                          0xFFFFF7FF
+#define   S_00D034_PACKET_READY(x)                                    (((x) & 0x1) << 12)
+#define   G_00D034_PACKET_READY(x)                                    (((x) >> 12) & 0x1)
+#define   C_00D034_PACKET_READY                                       0xFFFFEFFF
+#define   S_00D034_MC_WR_IDLE(x)                                      (((x) & 0x1) << 13)
+#define   G_00D034_MC_WR_IDLE(x)                                      (((x) >> 13) & 0x1)
+#define   C_00D034_MC_WR_IDLE                                         0xFFFFDFFF
+#define   S_00D034_SRBM_IDLE(x)                                       (((x) & 0x1) << 14)
+#define   G_00D034_SRBM_IDLE(x)                                       (((x) >> 14) & 0x1)
+#define   C_00D034_SRBM_IDLE                                          0xFFFFBFFF
+#define   S_00D034_CONTEXT_EMPTY(x)                                   (((x) & 0x1) << 15)
+#define   G_00D034_CONTEXT_EMPTY(x)                                   (((x) >> 15) & 0x1)
+#define   C_00D034_CONTEXT_EMPTY                                      0xFFFF7FFF
+#define   S_00D034_DELTA_RPTR_FULL(x)                                 (((x) & 0x1) << 16)
+#define   G_00D034_DELTA_RPTR_FULL(x)                                 (((x) >> 16) & 0x1)
+#define   C_00D034_DELTA_RPTR_FULL                                    0xFFFEFFFF
+#define   S_00D034_RB_MC_RREQ_IDLE(x)                                 (((x) & 0x1) << 17)
+#define   G_00D034_RB_MC_RREQ_IDLE(x)                                 (((x) >> 17) & 0x1)
+#define   C_00D034_RB_MC_RREQ_IDLE                                    0xFFFDFFFF
+#define   S_00D034_IB_MC_RREQ_IDLE(x)                                 (((x) & 0x1) << 18)
+#define   G_00D034_IB_MC_RREQ_IDLE(x)                                 (((x) >> 18) & 0x1)
+#define   C_00D034_IB_MC_RREQ_IDLE                                    0xFFFBFFFF
+#define   S_00D034_MC_RD_IDLE(x)                                      (((x) & 0x1) << 19)
+#define   G_00D034_MC_RD_IDLE(x)                                      (((x) >> 19) & 0x1)
+#define   C_00D034_MC_RD_IDLE                                         0xFFF7FFFF
+#define   S_00D034_DELTA_RPTR_EMPTY(x)                                (((x) & 0x1) << 20)
+#define   G_00D034_DELTA_RPTR_EMPTY(x)                                (((x) >> 20) & 0x1)
+#define   C_00D034_DELTA_RPTR_EMPTY                                   0xFFEFFFFF
+#define   S_00D034_MC_RD_RET_STALL(x)                                 (((x) & 0x1) << 21)
+#define   G_00D034_MC_RD_RET_STALL(x)                                 (((x) >> 21) & 0x1)
+#define   C_00D034_MC_RD_RET_STALL                                    0xFFDFFFFF
+#define   S_00D034_MC_RD_NO_POLL_IDLE(x)                              (((x) & 0x1) << 22)
+#define   G_00D034_MC_RD_NO_POLL_IDLE(x)                              (((x) >> 22) & 0x1)
+#define   C_00D034_MC_RD_NO_POLL_IDLE                                 0xFFBFFFFF
+#define   S_00D034_PREV_CMD_IDLE(x)                                   (((x) & 0x1) << 25)
+#define   G_00D034_PREV_CMD_IDLE(x)                                   (((x) >> 25) & 0x1)
+#define   C_00D034_PREV_CMD_IDLE                                      0xFDFFFFFF
+#define   S_00D034_SEM_IDLE(x)                                        (((x) & 0x1) << 26)
+#define   G_00D034_SEM_IDLE(x)                                        (((x) >> 26) & 0x1)
+#define   C_00D034_SEM_IDLE                                           0xFBFFFFFF
+#define   S_00D034_SEM_REQ_STALL(x)                                   (((x) & 0x1) << 27)
+#define   G_00D034_SEM_REQ_STALL(x)                                   (((x) >> 27) & 0x1)
+#define   C_00D034_SEM_REQ_STALL                                      0xF7FFFFFF
+#define   S_00D034_SEM_RESP_STATE(x)                                  (((x) & 0x03) << 28)
+#define   G_00D034_SEM_RESP_STATE(x)                                  (((x) >> 28) & 0x03)
+#define   C_00D034_SEM_RESP_STATE                                     0xCFFFFFFF
+#define   S_00D034_INT_IDLE(x)                                        (((x) & 0x1) << 30)
+#define   G_00D034_INT_IDLE(x)                                        (((x) >> 30) & 0x1)
+#define   C_00D034_INT_IDLE                                           0xBFFFFFFF
+#define   S_00D034_INT_REQ_STALL(x)                                   (((x) & 0x1) << 31)
+#define   G_00D034_INT_REQ_STALL(x)                                   (((x) >> 31) & 0x1)
+#define   C_00D034_INT_REQ_STALL                                      0x7FFFFFFF
+#define R_00D834_SDMA1_STATUS_REG                                       0x00D834
+#define R_008008_GRBM_STATUS2                                           0x008008
+#define   S_008008_ME0PIPE1_CMDFIFO_AVAIL(x)                          (((x) & 0x0F) << 0)
+#define   G_008008_ME0PIPE1_CMDFIFO_AVAIL(x)                          (((x) >> 0) & 0x0F)
+#define   C_008008_ME0PIPE1_CMDFIFO_AVAIL                             0xFFFFFFF0
+#define   S_008008_ME0PIPE1_CF_RQ_PENDING(x)                          (((x) & 0x1) << 4)
+#define   G_008008_ME0PIPE1_CF_RQ_PENDING(x)                          (((x) >> 4) & 0x1)
+#define   C_008008_ME0PIPE1_CF_RQ_PENDING                             0xFFFFFFEF
+#define   S_008008_ME0PIPE1_PF_RQ_PENDING(x)                          (((x) & 0x1) << 5)
+#define   G_008008_ME0PIPE1_PF_RQ_PENDING(x)                          (((x) >> 5) & 0x1)
+#define   C_008008_ME0PIPE1_PF_RQ_PENDING                             0xFFFFFFDF
+#define   S_008008_ME1PIPE0_RQ_PENDING(x)                             (((x) & 0x1) << 6)
+#define   G_008008_ME1PIPE0_RQ_PENDING(x)                             (((x) >> 6) & 0x1)
+#define   C_008008_ME1PIPE0_RQ_PENDING                                0xFFFFFFBF
+#define   S_008008_ME1PIPE1_RQ_PENDING(x)                             (((x) & 0x1) << 7)
+#define   G_008008_ME1PIPE1_RQ_PENDING(x)                             (((x) >> 7) & 0x1)
+#define   C_008008_ME1PIPE1_RQ_PENDING                                0xFFFFFF7F
+#define   S_008008_ME1PIPE2_RQ_PENDING(x)                             (((x) & 0x1) << 8)
+#define   G_008008_ME1PIPE2_RQ_PENDING(x)                             (((x) >> 8) & 0x1)
+#define   C_008008_ME1PIPE2_RQ_PENDING                                0xFFFFFEFF
+#define   S_008008_ME1PIPE3_RQ_PENDING(x)                             (((x) & 0x1) << 9)
+#define   G_008008_ME1PIPE3_RQ_PENDING(x)                             (((x) >> 9) & 0x1)
+#define   C_008008_ME1PIPE3_RQ_PENDING                                0xFFFFFDFF
+#define   S_008008_ME2PIPE0_RQ_PENDING(x)                             (((x) & 0x1) << 10)
+#define   G_008008_ME2PIPE0_RQ_PENDING(x)                             (((x) >> 10) & 0x1)
+#define   C_008008_ME2PIPE0_RQ_PENDING                                0xFFFFFBFF
+#define   S_008008_ME2PIPE1_RQ_PENDING(x)                             (((x) & 0x1) << 11)
+#define   G_008008_ME2PIPE1_RQ_PENDING(x)                             (((x) >> 11) & 0x1)
+#define   C_008008_ME2PIPE1_RQ_PENDING                                0xFFFFF7FF
+#define   S_008008_ME2PIPE2_RQ_PENDING(x)                             (((x) & 0x1) << 12)
+#define   G_008008_ME2PIPE2_RQ_PENDING(x)                             (((x) >> 12) & 0x1)
+#define   C_008008_ME2PIPE2_RQ_PENDING                                0xFFFFEFFF
+#define   S_008008_ME2PIPE3_RQ_PENDING(x)                             (((x) & 0x1) << 13)
+#define   G_008008_ME2PIPE3_RQ_PENDING(x)                             (((x) >> 13) & 0x1)
+#define   C_008008_ME2PIPE3_RQ_PENDING                                0xFFFFDFFF
+#define   S_008008_RLC_RQ_PENDING(x)                                  (((x) & 0x1) << 14)
+#define   G_008008_RLC_RQ_PENDING(x)                                  (((x) >> 14) & 0x1)
+#define   C_008008_RLC_RQ_PENDING                                     0xFFFFBFFF
+#define   S_008008_RLC_BUSY(x)                                        (((x) & 0x1) << 24)
+#define   G_008008_RLC_BUSY(x)                                        (((x) >> 24) & 0x1)
+#define   C_008008_RLC_BUSY                                           0xFEFFFFFF
+#define   S_008008_TC_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008008_TC_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008008_TC_BUSY                                            0xFDFFFFFF
+#define   S_008008_TCC_CC_RESIDENT(x)                                 (((x) & 0x1) << 26)
+#define   G_008008_TCC_CC_RESIDENT(x)                                 (((x) >> 26) & 0x1)
+#define   C_008008_TCC_CC_RESIDENT                                    0xFBFFFFFF
+#define   S_008008_CPF_BUSY(x)                                        (((x) & 0x1) << 28)
+#define   G_008008_CPF_BUSY(x)                                        (((x) >> 28) & 0x1)
+#define   C_008008_CPF_BUSY                                           0xEFFFFFFF
+#define   S_008008_CPC_BUSY(x)                                        (((x) & 0x1) << 29)
+#define   G_008008_CPC_BUSY(x)                                        (((x) >> 29) & 0x1)
+#define   C_008008_CPC_BUSY                                           0xDFFFFFFF
+#define   S_008008_CPG_BUSY(x)                                        (((x) & 0x1) << 30)
+#define   G_008008_CPG_BUSY(x)                                        (((x) >> 30) & 0x1)
+#define   C_008008_CPG_BUSY                                           0xBFFFFFFF
+#define R_008010_GRBM_STATUS                                            0x008010
+#define   S_008010_ME0PIPE0_CMDFIFO_AVAIL(x)                          (((x) & 0x0F) << 0)
+#define   G_008010_ME0PIPE0_CMDFIFO_AVAIL(x)                          (((x) >> 0) & 0x0F)
+#define   C_008010_ME0PIPE0_CMDFIFO_AVAIL                             0xFFFFFFF0
+#define   S_008010_SRBM_RQ_PENDING(x)                                 (((x) & 0x1) << 5) /* S_: keep bit 0 of x and move it to the field's bit position */
+#define   G_008010_SRBM_RQ_PENDING(x)                                 (((x) >> 5) & 0x1) /* G_: extract the field from a packed word */
+#define   C_008010_SRBM_RQ_PENDING                                    0xFFFFFFDF /* C_: AND-mask that zeroes only this field */
+#define   S_008010_ME0PIPE0_CF_RQ_PENDING(x)                          (((x) & 0x1) << 7)
+#define   G_008010_ME0PIPE0_CF_RQ_PENDING(x)                          (((x) >> 7) & 0x1)
+#define   C_008010_ME0PIPE0_CF_RQ_PENDING                             0xFFFFFF7F
+#define   S_008010_ME0PIPE0_PF_RQ_PENDING(x)                          (((x) & 0x1) << 8)
+#define   G_008010_ME0PIPE0_PF_RQ_PENDING(x)                          (((x) >> 8) & 0x1)
+#define   C_008010_ME0PIPE0_PF_RQ_PENDING                             0xFFFFFEFF
+#define   S_008010_GDS_DMA_RQ_PENDING(x)                              (((x) & 0x1) << 9)
+#define   G_008010_GDS_DMA_RQ_PENDING(x)                              (((x) >> 9) & 0x1)
+#define   C_008010_GDS_DMA_RQ_PENDING                                 0xFFFFFDFF
+#define   S_008010_DB_CLEAN(x)                                        (((x) & 0x1) << 12)
+#define   G_008010_DB_CLEAN(x)                                        (((x) >> 12) & 0x1)
+#define   C_008010_DB_CLEAN                                           0xFFFFEFFF
+#define   S_008010_CB_CLEAN(x)                                        (((x) & 0x1) << 13)
+#define   G_008010_CB_CLEAN(x)                                        (((x) >> 13) & 0x1)
+#define   C_008010_CB_CLEAN                                           0xFFFFDFFF
+#define   S_008010_TA_BUSY(x)                                         (((x) & 0x1) << 14)
+#define   G_008010_TA_BUSY(x)                                         (((x) >> 14) & 0x1)
+#define   C_008010_TA_BUSY                                            0xFFFFBFFF
+#define   S_008010_GDS_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_008010_GDS_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_008010_GDS_BUSY                                           0xFFFF7FFF
+#define   S_008010_WD_BUSY_NO_DMA(x)                                  (((x) & 0x1) << 16)
+#define   G_008010_WD_BUSY_NO_DMA(x)                                  (((x) >> 16) & 0x1)
+#define   C_008010_WD_BUSY_NO_DMA                                     0xFFFEFFFF
+#define   S_008010_VGT_BUSY(x)                                        (((x) & 0x1) << 17)
+#define   G_008010_VGT_BUSY(x)                                        (((x) >> 17) & 0x1)
+#define   C_008010_VGT_BUSY                                           0xFFFDFFFF
+#define   S_008010_IA_BUSY_NO_DMA(x)                                  (((x) & 0x1) << 18)
+#define   G_008010_IA_BUSY_NO_DMA(x)                                  (((x) >> 18) & 0x1)
+#define   C_008010_IA_BUSY_NO_DMA                                     0xFFFBFFFF
+#define   S_008010_IA_BUSY(x)                                         (((x) & 0x1) << 19)
+#define   G_008010_IA_BUSY(x)                                         (((x) >> 19) & 0x1)
+#define   C_008010_IA_BUSY                                            0xFFF7FFFF
+#define   S_008010_SX_BUSY(x)                                         (((x) & 0x1) << 20)
+#define   G_008010_SX_BUSY(x)                                         (((x) >> 20) & 0x1)
+#define   C_008010_SX_BUSY                                            0xFFEFFFFF
+#define   S_008010_WD_BUSY(x)                                         (((x) & 0x1) << 21)
+#define   G_008010_WD_BUSY(x)                                         (((x) >> 21) & 0x1)
+#define   C_008010_WD_BUSY                                            0xFFDFFFFF
+#define   S_008010_SPI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008010_SPI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008010_SPI_BUSY                                           0xFFBFFFFF
+#define   S_008010_BCI_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008010_BCI_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008010_BCI_BUSY                                           0xFF7FFFFF
+#define   S_008010_SC_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008010_SC_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008010_SC_BUSY                                            0xFEFFFFFF
+#define   S_008010_PA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008010_PA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008010_PA_BUSY                                            0xFDFFFFFF
+#define   S_008010_DB_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008010_DB_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008010_DB_BUSY                                            0xFBFFFFFF
+#define   S_008010_CP_COHERENCY_BUSY(x)                               (((x) & 0x1) << 28)
+#define   G_008010_CP_COHERENCY_BUSY(x)                               (((x) >> 28) & 0x1)
+#define   C_008010_CP_COHERENCY_BUSY                                  0xEFFFFFFF
+#define   S_008010_CP_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008010_CP_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008010_CP_BUSY                                            0xDFFFFFFF
+#define   S_008010_CB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008010_CB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008010_CB_BUSY                                            0xBFFFFFFF
+#define   S_008010_GUI_ACTIVE(x)                                      (((unsigned)(x) & 0x1) << 31) /* unsigned cast: shifting a signed int 1 into bit 31 overflows int (undefined behavior) */
+#define   G_008010_GUI_ACTIVE(x)                                      (((x) >> 31) & 0x1)
+#define   C_008010_GUI_ACTIVE                                         0x7FFFFFFF
 #define GRBM_GFX_INDEX                                                  0x802C
 #define         INSTANCE_INDEX(x)                                     ((x) << 0)
 #define         SH_INDEX(x)                                           ((x) << 8)
@@ -276,12 +668,155 @@
 #define   C_0085F0_SH_ICACHE_ACTION_ENA                               0xDFFFFFFF
 #define R_0085F4_CP_COHER_SIZE                                          0x0085F4
 #define R_0085F8_CP_COHER_BASE                                          0x0085F8
-
+#define R_008014_GRBM_STATUS_SE0                                        0x008014 /* macros below: S_ packs a 1-bit field, G_ extracts it, C_ is the AND-mask clearing it */
+#define   S_008014_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_008014_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_008014_DB_CLEAN                                           0xFFFFFFFD
+#define   S_008014_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_008014_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_008014_CB_CLEAN                                           0xFFFFFFFB
+#define   S_008014_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008014_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008014_BCI_BUSY                                           0xFFBFFFFF
+#define   S_008014_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008014_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008014_VGT_BUSY                                           0xFF7FFFFF
+#define   S_008014_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008014_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008014_PA_BUSY                                            0xFEFFFFFF
+#define   S_008014_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008014_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008014_TA_BUSY                                            0xFDFFFFFF
+#define   S_008014_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008014_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008014_SX_BUSY                                            0xFBFFFFFF
+#define   S_008014_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_008014_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_008014_SPI_BUSY                                           0xF7FFFFFF
+#define   S_008014_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008014_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008014_SC_BUSY                                            0xDFFFFFFF
+#define   S_008014_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008014_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008014_DB_BUSY                                            0xBFFFFFFF
+#define   S_008014_CB_BUSY(x)                                         (((unsigned)(x) & 0x1) << 31) /* unsigned cast: shifting a signed int 1 into bit 31 overflows int (undefined behavior) */
+#define   G_008014_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008014_CB_BUSY                                            0x7FFFFFFF
+#define R_008018_GRBM_STATUS_SE1                                        0x008018 /* macros below: S_ packs a 1-bit field, G_ extracts it, C_ is the AND-mask clearing it */
+#define   S_008018_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_008018_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_008018_DB_CLEAN                                           0xFFFFFFFD
+#define   S_008018_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_008018_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_008018_CB_CLEAN                                           0xFFFFFFFB
+#define   S_008018_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008018_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008018_BCI_BUSY                                           0xFFBFFFFF
+#define   S_008018_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008018_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008018_VGT_BUSY                                           0xFF7FFFFF
+#define   S_008018_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008018_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008018_PA_BUSY                                            0xFEFFFFFF
+#define   S_008018_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008018_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008018_TA_BUSY                                            0xFDFFFFFF
+#define   S_008018_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008018_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008018_SX_BUSY                                            0xFBFFFFFF
+#define   S_008018_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_008018_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_008018_SPI_BUSY                                           0xF7FFFFFF
+#define   S_008018_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008018_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008018_SC_BUSY                                            0xDFFFFFFF
+#define   S_008018_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008018_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008018_DB_BUSY                                            0xBFFFFFFF
+#define   S_008018_CB_BUSY(x)                                         (((unsigned)(x) & 0x1) << 31) /* unsigned cast: shifting a signed int 1 into bit 31 overflows int (undefined behavior) */
+#define   G_008018_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008018_CB_BUSY                                            0x7FFFFFFF
+#define R_008038_GRBM_STATUS_SE2                                        0x008038 /* macros below: S_ packs a 1-bit field, G_ extracts it, C_ is the AND-mask clearing it */
+#define   S_008038_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_008038_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_008038_DB_CLEAN                                           0xFFFFFFFD
+#define   S_008038_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_008038_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_008038_CB_CLEAN                                           0xFFFFFFFB
+#define   S_008038_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008038_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008038_BCI_BUSY                                           0xFFBFFFFF
+#define   S_008038_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_008038_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_008038_VGT_BUSY                                           0xFF7FFFFF
+#define   S_008038_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_008038_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_008038_PA_BUSY                                            0xFEFFFFFF
+#define   S_008038_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_008038_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_008038_TA_BUSY                                            0xFDFFFFFF
+#define   S_008038_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008038_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008038_SX_BUSY                                            0xFBFFFFFF
+#define   S_008038_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_008038_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_008038_SPI_BUSY                                           0xF7FFFFFF
+#define   S_008038_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_008038_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_008038_SC_BUSY                                            0xDFFFFFFF
+#define   S_008038_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_008038_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_008038_DB_BUSY                                            0xBFFFFFFF
+#define   S_008038_CB_BUSY(x)                                         (((unsigned)(x) & 0x1) << 31) /* unsigned cast: shifting a signed int 1 into bit 31 overflows int (undefined behavior) */
+#define   G_008038_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008038_CB_BUSY                                            0x7FFFFFFF
+#define R_00803C_GRBM_STATUS_SE3                                        0x00803C /* macros below: S_ packs a 1-bit field, G_ extracts it, C_ is the AND-mask clearing it */
+#define   S_00803C_DB_CLEAN(x)                                        (((x) & 0x1) << 1)
+#define   G_00803C_DB_CLEAN(x)                                        (((x) >> 1) & 0x1)
+#define   C_00803C_DB_CLEAN                                           0xFFFFFFFD
+#define   S_00803C_CB_CLEAN(x)                                        (((x) & 0x1) << 2)
+#define   G_00803C_CB_CLEAN(x)                                        (((x) >> 2) & 0x1)
+#define   C_00803C_CB_CLEAN                                           0xFFFFFFFB
+#define   S_00803C_BCI_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_00803C_BCI_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_00803C_BCI_BUSY                                           0xFFBFFFFF
+#define   S_00803C_VGT_BUSY(x)                                        (((x) & 0x1) << 23)
+#define   G_00803C_VGT_BUSY(x)                                        (((x) >> 23) & 0x1)
+#define   C_00803C_VGT_BUSY                                           0xFF7FFFFF
+#define   S_00803C_PA_BUSY(x)                                         (((x) & 0x1) << 24)
+#define   G_00803C_PA_BUSY(x)                                         (((x) >> 24) & 0x1)
+#define   C_00803C_PA_BUSY                                            0xFEFFFFFF
+#define   S_00803C_TA_BUSY(x)                                         (((x) & 0x1) << 25)
+#define   G_00803C_TA_BUSY(x)                                         (((x) >> 25) & 0x1)
+#define   C_00803C_TA_BUSY                                            0xFDFFFFFF
+#define   S_00803C_SX_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_00803C_SX_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_00803C_SX_BUSY                                            0xFBFFFFFF
+#define   S_00803C_SPI_BUSY(x)                                        (((x) & 0x1) << 27)
+#define   G_00803C_SPI_BUSY(x)                                        (((x) >> 27) & 0x1)
+#define   C_00803C_SPI_BUSY                                           0xF7FFFFFF
+#define   S_00803C_SC_BUSY(x)                                         (((x) & 0x1) << 29)
+#define   G_00803C_SC_BUSY(x)                                         (((x) >> 29) & 0x1)
+#define   C_00803C_SC_BUSY                                            0xDFFFFFFF
+#define   S_00803C_DB_BUSY(x)                                         (((x) & 0x1) << 30)
+#define   G_00803C_DB_BUSY(x)                                         (((x) >> 30) & 0x1)
+#define   C_00803C_DB_BUSY                                            0xBFFFFFFF
+#define   S_00803C_CB_BUSY(x)                                         (((unsigned)(x) & 0x1) << 31) /* unsigned cast: shifting a signed int 1 into bit 31 overflows int (undefined behavior) */
+#define   G_00803C_CB_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_00803C_CB_BUSY                                            0x7FFFFFFF
 /* CIK */
+#define R_0300FC_CP_STRMOUT_CNTL                                        0x0300FC /* the S_/G_/C_ macros below describe one 1-bit field at bit 0 */
+#define   S_0300FC_OFFSET_UPDATE_DONE(x)                              (((x) & 0x1) << 0) /* pack: keeps only bit 0 of x */
+#define   G_0300FC_OFFSET_UPDATE_DONE(x)                              (((x) >> 0) & 0x1) /* extract: yields 0 or 1 */
+#define   C_0300FC_OFFSET_UPDATE_DONE                                 0xFFFFFFFE /* AND-mask that zeroes bit 0 only */
 #define R_0301E4_CP_COHER_BASE_HI                                       0x0301E4
 #define   S_0301E4_COHER_BASE_HI_256B(x)                              (((x) & 0xFF) << 0)
 #define   G_0301E4_COHER_BASE_HI_256B(x)                              (((x) >> 0) & 0xFF)
 #define   C_0301E4_COHER_BASE_HI_256B                                 0xFFFFFF00
+#define R_0301EC_CP_COHER_START_DELAY                                   0x0301EC /* the S_/G_/C_ macros below describe one 6-bit field at bits 5:0 */
+#define   S_0301EC_START_DELAY_COUNT(x)                               (((x) & 0x3F) << 0) /* pack: values above 0x3F are truncated by the mask */
+#define   G_0301EC_START_DELAY_COUNT(x)                               (((x) >> 0) & 0x3F) /* extract: yields 0..63 */
+#define   C_0301EC_START_DELAY_COUNT                                  0xFFFFFFC0 /* AND-mask that zeroes bits 5:0 only */
 #define R_0301F0_CP_COHER_CNTL                                          0x0301F0
 #define   S_0301F0_DEST_BASE_0_ENA(x)                                 (((x) & 0x1) << 0)
 #define   G_0301F0_DEST_BASE_0_ENA(x)                                 (((x) >> 0) & 0x1)
@@ -289,6 +824,14 @@
 #define   S_0301F0_DEST_BASE_1_ENA(x)                                 (((x) & 0x1) << 1)
 #define   G_0301F0_DEST_BASE_1_ENA(x)                                 (((x) >> 1) & 0x1)
 #define   C_0301F0_DEST_BASE_1_ENA                                    0xFFFFFFFD
+/* VI */
+#define   S_0301F0_TC_SD_ACTION_ENA(x)                                (((x) & 0x1) << 2) /* pack a 1-bit value at bit 2 */
+#define   G_0301F0_TC_SD_ACTION_ENA(x)                                (((x) >> 2) & 0x1) /* extract bit 2 */
+#define   C_0301F0_TC_SD_ACTION_ENA                                   0xFFFFFFFB /* AND-mask that zeroes bit 2 only */
+#define   S_0301F0_TC_NC_ACTION_ENA(x)                                (((x) & 0x1) << 3) /* pack a 1-bit value at bit 3 */
+#define   G_0301F0_TC_NC_ACTION_ENA(x)                                (((x) >> 3) & 0x1) /* extract bit 3 */
+#define   C_0301F0_TC_NC_ACTION_ENA                                   0xFFFFFFF7 /* AND-mask that zeroes bit 3 only */
+/*    */
 #define   S_0301F0_CB0_DEST_BASE_ENA(x)                               (((x) & 0x1) << 6)
 #define   G_0301F0_CB0_DEST_BASE_ENA(x)                               (((x) >> 6) & 0x1)
 #define   C_0301F0_CB0_DEST_BASE_ENA                                  0xFFFFFFBF
@@ -319,7 +862,7 @@
 #define   S_0301F0_TCL1_VOL_ACTION_ENA(x)                             (((x) & 0x1) << 15)
 #define   G_0301F0_TCL1_VOL_ACTION_ENA(x)                             (((x) >> 15) & 0x1)
 #define   C_0301F0_TCL1_VOL_ACTION_ENA                                0xFFFF7FFF
-#define   S_0301F0_TC_VOL_ACTION_ENA(x)                               (((x) & 0x1) << 16)
+#define   S_0301F0_TC_VOL_ACTION_ENA(x)                               (((x) & 0x1) << 16) /* not on VI */
 #define   G_0301F0_TC_VOL_ACTION_ENA(x)                               (((x) >> 16) & 0x1)
 #define   C_0301F0_TC_VOL_ACTION_ENA                                  0xFFFEFFFF
 #define   S_0301F0_TC_WB_ACTION_ENA(x)                                (((x) & 0x1) << 18)
@@ -352,8 +895,389 @@
 #define   S_0301F0_SH_ICACHE_ACTION_ENA(x)                            (((x) & 0x1) << 29)
 #define   G_0301F0_SH_ICACHE_ACTION_ENA(x)                            (((x) >> 29) & 0x1)
 #define   C_0301F0_SH_ICACHE_ACTION_ENA                               0xDFFFFFFF
+/* VI */
+#define   S_0301F0_SH_KCACHE_WB_ACTION_ENA(x)                         (((x) & 0x1) << 30)
+#define   G_0301F0_SH_KCACHE_WB_ACTION_ENA(x)                         (((x) >> 30) & 0x1)
+#define   C_0301F0_SH_KCACHE_WB_ACTION_ENA                            0xBFFFFFFF
+#define   S_0301F0_SH_SD_ACTION_ENA(x)                                (((unsigned)(x) & 0x1) << 31) /* packs bit 31; unsigned cast because shifting a signed 1 into the sign bit is undefined */
+#define   G_0301F0_SH_SD_ACTION_ENA(x)                                (((x) >> 31) & 0x1)
+#define   C_0301F0_SH_SD_ACTION_ENA                                   0x7FFFFFFF
+/*    */
 #define R_0301F4_CP_COHER_SIZE                                          0x0301F4
 #define R_0301F8_CP_COHER_BASE                                          0x0301F8
+#define R_0301FC_CP_COHER_STATUS                                        0x0301FC /* fields: MATCHING_GFX_CNTX [7:0], MEID [25:24], PHASE1_STATUS [30], STATUS [31] */
+#define   S_0301FC_MATCHING_GFX_CNTX(x)                               (((x) & 0xFF) << 0)
+#define   G_0301FC_MATCHING_GFX_CNTX(x)                               (((x) >> 0) & 0xFF)
+#define   C_0301FC_MATCHING_GFX_CNTX                                  0xFFFFFF00
+#define   S_0301FC_MEID(x)                                            (((x) & 0x03) << 24)
+#define   G_0301FC_MEID(x)                                            (((x) >> 24) & 0x03)
+#define   C_0301FC_MEID                                               0xFCFFFFFF
+#define   S_0301FC_PHASE1_STATUS(x)                                   (((x) & 0x1) << 30)
+#define   G_0301FC_PHASE1_STATUS(x)                                   (((x) >> 30) & 0x1)
+#define   C_0301FC_PHASE1_STATUS                                      0xBFFFFFFF
+#define   S_0301FC_STATUS(x)                                          (((unsigned)(x) & 0x1) << 31) /* packs bit 31; unsigned cast because shifting a signed 1 into the sign bit is undefined */
+#define   G_0301FC_STATUS(x)                                          (((x) >> 31) & 0x1)
+#define   C_0301FC_STATUS                                             0x7FFFFFFF
+#define R_008210_CP_CPC_STATUS                                          0x008210 /* single-bit *_BUSY fields in bits 0-7, 10-13 and 29-31; no field is defined here for bits 8-9 and 14-28 */
+#define   S_008210_MEC1_BUSY(x)                                       (((x) & 0x1) << 0)
+#define   G_008210_MEC1_BUSY(x)                                       (((x) >> 0) & 0x1)
+#define   C_008210_MEC1_BUSY                                          0xFFFFFFFE
+#define   S_008210_MEC2_BUSY(x)                                       (((x) & 0x1) << 1)
+#define   G_008210_MEC2_BUSY(x)                                       (((x) >> 1) & 0x1)
+#define   C_008210_MEC2_BUSY                                          0xFFFFFFFD
+#define   S_008210_DC0_BUSY(x)                                        (((x) & 0x1) << 2)
+#define   G_008210_DC0_BUSY(x)                                        (((x) >> 2) & 0x1)
+#define   C_008210_DC0_BUSY                                           0xFFFFFFFB
+#define   S_008210_DC1_BUSY(x)                                        (((x) & 0x1) << 3)
+#define   G_008210_DC1_BUSY(x)                                        (((x) >> 3) & 0x1)
+#define   C_008210_DC1_BUSY                                           0xFFFFFFF7
+#define   S_008210_RCIU1_BUSY(x)                                      (((x) & 0x1) << 4)
+#define   G_008210_RCIU1_BUSY(x)                                      (((x) >> 4) & 0x1)
+#define   C_008210_RCIU1_BUSY                                         0xFFFFFFEF
+#define   S_008210_RCIU2_BUSY(x)                                      (((x) & 0x1) << 5)
+#define   G_008210_RCIU2_BUSY(x)                                      (((x) >> 5) & 0x1)
+#define   C_008210_RCIU2_BUSY                                         0xFFFFFFDF
+#define   S_008210_ROQ1_BUSY(x)                                       (((x) & 0x1) << 6)
+#define   G_008210_ROQ1_BUSY(x)                                       (((x) >> 6) & 0x1)
+#define   C_008210_ROQ1_BUSY                                          0xFFFFFFBF
+#define   S_008210_ROQ2_BUSY(x)                                       (((x) & 0x1) << 7)
+#define   G_008210_ROQ2_BUSY(x)                                       (((x) >> 7) & 0x1)
+#define   C_008210_ROQ2_BUSY                                          0xFFFFFF7F
+#define   S_008210_TCIU_BUSY(x)                                       (((x) & 0x1) << 10)
+#define   G_008210_TCIU_BUSY(x)                                       (((x) >> 10) & 0x1)
+#define   C_008210_TCIU_BUSY                                          0xFFFFFBFF
+#define   S_008210_SCRATCH_RAM_BUSY(x)                                (((x) & 0x1) << 11)
+#define   G_008210_SCRATCH_RAM_BUSY(x)                                (((x) >> 11) & 0x1)
+#define   C_008210_SCRATCH_RAM_BUSY                                   0xFFFFF7FF
+#define   S_008210_QU_BUSY(x)                                         (((x) & 0x1) << 12)
+#define   G_008210_QU_BUSY(x)                                         (((x) >> 12) & 0x1)
+#define   C_008210_QU_BUSY                                            0xFFFFEFFF
+#define   S_008210_ATCL2IU_BUSY(x)                                    (((x) & 0x1) << 13)
+#define   G_008210_ATCL2IU_BUSY(x)                                    (((x) >> 13) & 0x1)
+#define   C_008210_ATCL2IU_BUSY                                       0xFFFFDFFF
+#define   S_008210_CPG_CPC_BUSY(x)                                    (((x) & 0x1) << 29)
+#define   G_008210_CPG_CPC_BUSY(x)                                    (((x) >> 29) & 0x1)
+#define   C_008210_CPG_CPC_BUSY                                       0xDFFFFFFF
+#define   S_008210_CPF_CPC_BUSY(x)                                    (((x) & 0x1) << 30)
+#define   G_008210_CPF_CPC_BUSY(x)                                    (((x) >> 30) & 0x1)
+#define   C_008210_CPF_CPC_BUSY                                       0xBFFFFFFF
+#define   S_008210_CPC_BUSY(x)                                        (((unsigned)(x) & 0x1) << 31) /* packs bit 31; unsigned cast because shifting a signed 1 into the sign bit is undefined */
+#define   G_008210_CPC_BUSY(x)                                        (((x) >> 31) & 0x1)
+#define   C_008210_CPC_BUSY                                           0x7FFFFFFF
+#define R_008214_CP_CPC_BUSY_STAT                                       0x008214 /* single-bit fields: MEC1_* in bits 0-13, the matching MEC2_* set 16 bits higher in bits 16-29 */
+#define   S_008214_MEC1_LOAD_BUSY(x)                                  (((x) & 0x1) << 0)
+#define   G_008214_MEC1_LOAD_BUSY(x)                                  (((x) >> 0) & 0x1)
+#define   C_008214_MEC1_LOAD_BUSY                                     0xFFFFFFFE
+#define   S_008214_MEC1_SEMAPOHRE_BUSY(x)                             (((x) & 0x1) << 1)
+#define   G_008214_MEC1_SEMAPOHRE_BUSY(x)                             (((x) >> 1) & 0x1)
+#define   C_008214_MEC1_SEMAPOHRE_BUSY                                0xFFFFFFFD
+#define   S_008214_MEC1_MUTEX_BUSY(x)                                 (((x) & 0x1) << 2)
+#define   G_008214_MEC1_MUTEX_BUSY(x)                                 (((x) >> 2) & 0x1)
+#define   C_008214_MEC1_MUTEX_BUSY                                    0xFFFFFFFB
+#define   S_008214_MEC1_MESSAGE_BUSY(x)                               (((x) & 0x1) << 3)
+#define   G_008214_MEC1_MESSAGE_BUSY(x)                               (((x) >> 3) & 0x1)
+#define   C_008214_MEC1_MESSAGE_BUSY                                  0xFFFFFFF7
+#define   S_008214_MEC1_EOP_QUEUE_BUSY(x)                             (((x) & 0x1) << 4)
+#define   G_008214_MEC1_EOP_QUEUE_BUSY(x)                             (((x) >> 4) & 0x1)
+#define   C_008214_MEC1_EOP_QUEUE_BUSY                                0xFFFFFFEF
+#define   S_008214_MEC1_IQ_QUEUE_BUSY(x)                              (((x) & 0x1) << 5)
+#define   G_008214_MEC1_IQ_QUEUE_BUSY(x)                              (((x) >> 5) & 0x1)
+#define   C_008214_MEC1_IQ_QUEUE_BUSY                                 0xFFFFFFDF
+#define   S_008214_MEC1_IB_QUEUE_BUSY(x)                              (((x) & 0x1) << 6)
+#define   G_008214_MEC1_IB_QUEUE_BUSY(x)                              (((x) >> 6) & 0x1)
+#define   C_008214_MEC1_IB_QUEUE_BUSY                                 0xFFFFFFBF
+#define   S_008214_MEC1_TC_BUSY(x)                                    (((x) & 0x1) << 7)
+#define   G_008214_MEC1_TC_BUSY(x)                                    (((x) >> 7) & 0x1)
+#define   C_008214_MEC1_TC_BUSY                                       0xFFFFFF7F
+#define   S_008214_MEC1_DMA_BUSY(x)                                   (((x) & 0x1) << 8)
+#define   G_008214_MEC1_DMA_BUSY(x)                                   (((x) >> 8) & 0x1)
+#define   C_008214_MEC1_DMA_BUSY                                      0xFFFFFEFF
+#define   S_008214_MEC1_PARTIAL_FLUSH_BUSY(x)                         (((x) & 0x1) << 9)
+#define   G_008214_MEC1_PARTIAL_FLUSH_BUSY(x)                         (((x) >> 9) & 0x1)
+#define   C_008214_MEC1_PARTIAL_FLUSH_BUSY                            0xFFFFFDFF
+#define   S_008214_MEC1_PIPE0_BUSY(x)                                 (((x) & 0x1) << 10)
+#define   G_008214_MEC1_PIPE0_BUSY(x)                                 (((x) >> 10) & 0x1)
+#define   C_008214_MEC1_PIPE0_BUSY                                    0xFFFFFBFF
+#define   S_008214_MEC1_PIPE1_BUSY(x)                                 (((x) & 0x1) << 11)
+#define   G_008214_MEC1_PIPE1_BUSY(x)                                 (((x) >> 11) & 0x1)
+#define   C_008214_MEC1_PIPE1_BUSY                                    0xFFFFF7FF
+#define   S_008214_MEC1_PIPE2_BUSY(x)                                 (((x) & 0x1) << 12)
+#define   G_008214_MEC1_PIPE2_BUSY(x)                                 (((x) >> 12) & 0x1)
+#define   C_008214_MEC1_PIPE2_BUSY                                    0xFFFFEFFF
+#define   S_008214_MEC1_PIPE3_BUSY(x)                                 (((x) & 0x1) << 13)
+#define   G_008214_MEC1_PIPE3_BUSY(x)                                 (((x) >> 13) & 0x1)
+#define   C_008214_MEC1_PIPE3_BUSY                                    0xFFFFDFFF
+#define   S_008214_MEC2_LOAD_BUSY(x)                                  (((x) & 0x1) << 16)
+#define   G_008214_MEC2_LOAD_BUSY(x)                                  (((x) >> 16) & 0x1)
+#define   C_008214_MEC2_LOAD_BUSY                                     0xFFFEFFFF
+#define   S_008214_MEC2_SEMAPOHRE_BUSY(x)                             (((x) & 0x1) << 17)
+#define   G_008214_MEC2_SEMAPOHRE_BUSY(x)                             (((x) >> 17) & 0x1)
+#define   C_008214_MEC2_SEMAPOHRE_BUSY                                0xFFFDFFFF
+#define   S_008214_MEC2_MUTEX_BUSY(x)                                 (((x) & 0x1) << 18)
+#define   G_008214_MEC2_MUTEX_BUSY(x)                                 (((x) >> 18) & 0x1)
+#define   C_008214_MEC2_MUTEX_BUSY                                    0xFFFBFFFF
+#define   S_008214_MEC2_MESSAGE_BUSY(x)                               (((x) & 0x1) << 19)
+#define   G_008214_MEC2_MESSAGE_BUSY(x)                               (((x) >> 19) & 0x1)
+#define   C_008214_MEC2_MESSAGE_BUSY                                  0xFFF7FFFF
+#define   S_008214_MEC2_EOP_QUEUE_BUSY(x)                             (((x) & 0x1) << 20)
+#define   G_008214_MEC2_EOP_QUEUE_BUSY(x)                             (((x) >> 20) & 0x1)
+#define   C_008214_MEC2_EOP_QUEUE_BUSY                                0xFFEFFFFF
+#define   S_008214_MEC2_IQ_QUEUE_BUSY(x)                              (((x) & 0x1) << 21)
+#define   G_008214_MEC2_IQ_QUEUE_BUSY(x)                              (((x) >> 21) & 0x1)
+#define   C_008214_MEC2_IQ_QUEUE_BUSY                                 0xFFDFFFFF
+#define   S_008214_MEC2_IB_QUEUE_BUSY(x)                              (((x) & 0x1) << 22)
+#define   G_008214_MEC2_IB_QUEUE_BUSY(x)                              (((x) >> 22) & 0x1)
+#define   C_008214_MEC2_IB_QUEUE_BUSY                                 0xFFBFFFFF
+#define   S_008214_MEC2_TC_BUSY(x)                                    (((x) & 0x1) << 23)
+#define   G_008214_MEC2_TC_BUSY(x)                                    (((x) >> 23) & 0x1)
+#define   C_008214_MEC2_TC_BUSY                                       0xFF7FFFFF
+#define   S_008214_MEC2_DMA_BUSY(x)                                   (((x) & 0x1) << 24)
+#define   G_008214_MEC2_DMA_BUSY(x)                                   (((x) >> 24) & 0x1)
+#define   C_008214_MEC2_DMA_BUSY                                      0xFEFFFFFF
+#define   S_008214_MEC2_PARTIAL_FLUSH_BUSY(x)                         (((x) & 0x1) << 25)
+#define   G_008214_MEC2_PARTIAL_FLUSH_BUSY(x)                         (((x) >> 25) & 0x1)
+#define   C_008214_MEC2_PARTIAL_FLUSH_BUSY                            0xFDFFFFFF
+#define   S_008214_MEC2_PIPE0_BUSY(x)                                 (((x) & 0x1) << 26)
+#define   G_008214_MEC2_PIPE0_BUSY(x)                                 (((x) >> 26) & 0x1)
+#define   C_008214_MEC2_PIPE0_BUSY                                    0xFBFFFFFF
+#define   S_008214_MEC2_PIPE1_BUSY(x)                                 (((x) & 0x1) << 27)
+#define   G_008214_MEC2_PIPE1_BUSY(x)                                 (((x) >> 27) & 0x1)
+#define   C_008214_MEC2_PIPE1_BUSY                                    0xF7FFFFFF
+#define   S_008214_MEC2_PIPE2_BUSY(x)                                 (((x) & 0x1) << 28)
+#define   G_008214_MEC2_PIPE2_BUSY(x)                                 (((x) >> 28) & 0x1)
+#define   C_008214_MEC2_PIPE2_BUSY                                    0xEFFFFFFF
+#define   S_008214_MEC2_PIPE3_BUSY(x)                                 (((x) & 0x1) << 29)
+#define   G_008214_MEC2_PIPE3_BUSY(x)                                 (((x) >> 29) & 0x1)
+#define   C_008214_MEC2_PIPE3_BUSY                                    0xDFFFFFFF
+#define R_008218_CP_CPC_STALLED_STAT1                                   0x008218 /* single-bit fields: RCIU/TCIU in bits 3-6, MEC1_* in bits 8-13, MEC2_* in bits 16-21, ATC* in bits 22-24 */
+#define   S_008218_RCIU_TX_FREE_STALL(x)                              (((x) & 0x1) << 3)
+#define   G_008218_RCIU_TX_FREE_STALL(x)                              (((x) >> 3) & 0x1)
+#define   C_008218_RCIU_TX_FREE_STALL                                 0xFFFFFFF7
+#define   S_008218_RCIU_PRIV_VIOLATION(x)                             (((x) & 0x1) << 4)
+#define   G_008218_RCIU_PRIV_VIOLATION(x)                             (((x) >> 4) & 0x1)
+#define   C_008218_RCIU_PRIV_VIOLATION                                0xFFFFFFEF
+#define   S_008218_TCIU_TX_FREE_STALL(x)                              (((x) & 0x1) << 6)
+#define   G_008218_TCIU_TX_FREE_STALL(x)                              (((x) >> 6) & 0x1)
+#define   C_008218_TCIU_TX_FREE_STALL                                 0xFFFFFFBF
+#define   S_008218_MEC1_DECODING_PACKET(x)                            (((x) & 0x1) << 8)
+#define   G_008218_MEC1_DECODING_PACKET(x)                            (((x) >> 8) & 0x1)
+#define   C_008218_MEC1_DECODING_PACKET                               0xFFFFFEFF
+#define   S_008218_MEC1_WAIT_ON_RCIU(x)                               (((x) & 0x1) << 9)
+#define   G_008218_MEC1_WAIT_ON_RCIU(x)                               (((x) >> 9) & 0x1)
+#define   C_008218_MEC1_WAIT_ON_RCIU                                  0xFFFFFDFF
+#define   S_008218_MEC1_WAIT_ON_RCIU_READ(x)                          (((x) & 0x1) << 10)
+#define   G_008218_MEC1_WAIT_ON_RCIU_READ(x)                          (((x) >> 10) & 0x1)
+#define   C_008218_MEC1_WAIT_ON_RCIU_READ                             0xFFFFFBFF
+#define   S_008218_MEC1_WAIT_ON_ROQ_DATA(x)                           (((x) & 0x1) << 13)
+#define   G_008218_MEC1_WAIT_ON_ROQ_DATA(x)                           (((x) >> 13) & 0x1)
+#define   C_008218_MEC1_WAIT_ON_ROQ_DATA                              0xFFFFDFFF
+#define   S_008218_MEC2_DECODING_PACKET(x)                            (((x) & 0x1) << 16)
+#define   G_008218_MEC2_DECODING_PACKET(x)                            (((x) >> 16) & 0x1)
+#define   C_008218_MEC2_DECODING_PACKET                               0xFFFEFFFF
+#define   S_008218_MEC2_WAIT_ON_RCIU(x)                               (((x) & 0x1) << 17)
+#define   G_008218_MEC2_WAIT_ON_RCIU(x)                               (((x) >> 17) & 0x1)
+#define   C_008218_MEC2_WAIT_ON_RCIU                                  0xFFFDFFFF
+#define   S_008218_MEC2_WAIT_ON_RCIU_READ(x)                          (((x) & 0x1) << 18)
+#define   G_008218_MEC2_WAIT_ON_RCIU_READ(x)                          (((x) >> 18) & 0x1)
+#define   C_008218_MEC2_WAIT_ON_RCIU_READ                             0xFFFBFFFF
+#define   S_008218_MEC2_WAIT_ON_ROQ_DATA(x)                           (((x) & 0x1) << 21)
+#define   G_008218_MEC2_WAIT_ON_ROQ_DATA(x)                           (((x) >> 21) & 0x1)
+#define   C_008218_MEC2_WAIT_ON_ROQ_DATA                              0xFFDFFFFF
+#define   S_008218_ATCL2IU_WAITING_ON_FREE(x)                         (((x) & 0x1) << 22)
+#define   G_008218_ATCL2IU_WAITING_ON_FREE(x)                         (((x) >> 22) & 0x1)
+#define   C_008218_ATCL2IU_WAITING_ON_FREE                            0xFFBFFFFF
+#define   S_008218_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) & 0x1) << 23)
+#define   G_008218_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) >> 23) & 0x1)
+#define   C_008218_ATCL2IU_WAITING_ON_TAGS                            0xFF7FFFFF
+#define   S_008218_ATCL1_WAITING_ON_TRANS(x)                          (((x) & 0x1) << 24)
+#define   G_008218_ATCL1_WAITING_ON_TRANS(x)                          (((x) >> 24) & 0x1)
+#define   C_008218_ATCL1_WAITING_ON_TRANS                             0xFEFFFFFF
+#define R_00821C_CP_CPF_STATUS                                          0x00821C /* every field is 1 bit wide except GRBM_CPF_STAT_BUSY, which is 2 bits at [29:28] */
+#define   S_00821C_POST_WPTR_GFX_BUSY(x)                              (((x) & 0x1) << 0)
+#define   G_00821C_POST_WPTR_GFX_BUSY(x)                              (((x) >> 0) & 0x1)
+#define   C_00821C_POST_WPTR_GFX_BUSY                                 0xFFFFFFFE
+#define   S_00821C_CSF_BUSY(x)                                        (((x) & 0x1) << 1)
+#define   G_00821C_CSF_BUSY(x)                                        (((x) >> 1) & 0x1)
+#define   C_00821C_CSF_BUSY                                           0xFFFFFFFD
+#define   S_00821C_ROQ_ALIGN_BUSY(x)                                  (((x) & 0x1) << 4)
+#define   G_00821C_ROQ_ALIGN_BUSY(x)                                  (((x) >> 4) & 0x1)
+#define   C_00821C_ROQ_ALIGN_BUSY                                     0xFFFFFFEF
+#define   S_00821C_ROQ_RING_BUSY(x)                                   (((x) & 0x1) << 5)
+#define   G_00821C_ROQ_RING_BUSY(x)                                   (((x) >> 5) & 0x1)
+#define   C_00821C_ROQ_RING_BUSY                                      0xFFFFFFDF
+#define   S_00821C_ROQ_INDIRECT1_BUSY(x)                              (((x) & 0x1) << 6)
+#define   G_00821C_ROQ_INDIRECT1_BUSY(x)                              (((x) >> 6) & 0x1)
+#define   C_00821C_ROQ_INDIRECT1_BUSY                                 0xFFFFFFBF
+#define   S_00821C_ROQ_INDIRECT2_BUSY(x)                              (((x) & 0x1) << 7)
+#define   G_00821C_ROQ_INDIRECT2_BUSY(x)                              (((x) >> 7) & 0x1)
+#define   C_00821C_ROQ_INDIRECT2_BUSY                                 0xFFFFFF7F
+#define   S_00821C_ROQ_STATE_BUSY(x)                                  (((x) & 0x1) << 8)
+#define   G_00821C_ROQ_STATE_BUSY(x)                                  (((x) >> 8) & 0x1)
+#define   C_00821C_ROQ_STATE_BUSY                                     0xFFFFFEFF
+#define   S_00821C_ROQ_CE_RING_BUSY(x)                                (((x) & 0x1) << 9)
+#define   G_00821C_ROQ_CE_RING_BUSY(x)                                (((x) >> 9) & 0x1)
+#define   C_00821C_ROQ_CE_RING_BUSY                                   0xFFFFFDFF
+#define   S_00821C_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) & 0x1) << 10)
+#define   G_00821C_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) >> 10) & 0x1)
+#define   C_00821C_ROQ_CE_INDIRECT1_BUSY                              0xFFFFFBFF
+#define   S_00821C_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) & 0x1) << 11)
+#define   G_00821C_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) >> 11) & 0x1)
+#define   C_00821C_ROQ_CE_INDIRECT2_BUSY                              0xFFFFF7FF
+#define   S_00821C_SEMAPHORE_BUSY(x)                                  (((x) & 0x1) << 12)
+#define   G_00821C_SEMAPHORE_BUSY(x)                                  (((x) >> 12) & 0x1)
+#define   C_00821C_SEMAPHORE_BUSY                                     0xFFFFEFFF
+#define   S_00821C_INTERRUPT_BUSY(x)                                  (((x) & 0x1) << 13)
+#define   G_00821C_INTERRUPT_BUSY(x)                                  (((x) >> 13) & 0x1)
+#define   C_00821C_INTERRUPT_BUSY                                     0xFFFFDFFF
+#define   S_00821C_TCIU_BUSY(x)                                       (((x) & 0x1) << 14)
+#define   G_00821C_TCIU_BUSY(x)                                       (((x) >> 14) & 0x1)
+#define   C_00821C_TCIU_BUSY                                          0xFFFFBFFF
+#define   S_00821C_HQD_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_00821C_HQD_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_00821C_HQD_BUSY                                           0xFFFF7FFF
+#define   S_00821C_PRT_BUSY(x)                                        (((x) & 0x1) << 16)
+#define   G_00821C_PRT_BUSY(x)                                        (((x) >> 16) & 0x1)
+#define   C_00821C_PRT_BUSY                                           0xFFFEFFFF
+#define   S_00821C_ATCL2IU_BUSY(x)                                    (((x) & 0x1) << 17)
+#define   G_00821C_ATCL2IU_BUSY(x)                                    (((x) >> 17) & 0x1)
+#define   C_00821C_ATCL2IU_BUSY                                       0xFFFDFFFF
+#define   S_00821C_CPF_GFX_BUSY(x)                                    (((x) & 0x1) << 26)
+#define   G_00821C_CPF_GFX_BUSY(x)                                    (((x) >> 26) & 0x1)
+#define   C_00821C_CPF_GFX_BUSY                                       0xFBFFFFFF
+#define   S_00821C_CPF_CMP_BUSY(x)                                    (((x) & 0x1) << 27)
+#define   G_00821C_CPF_CMP_BUSY(x)                                    (((x) >> 27) & 0x1)
+#define   C_00821C_CPF_CMP_BUSY                                       0xF7FFFFFF
+#define   S_00821C_GRBM_CPF_STAT_BUSY(x)                              (((x) & 0x03) << 28)
+#define   G_00821C_GRBM_CPF_STAT_BUSY(x)                              (((x) >> 28) & 0x03)
+#define   C_00821C_GRBM_CPF_STAT_BUSY                                 0xCFFFFFFF
+#define   S_00821C_CPC_CPF_BUSY(x)                                    (((x) & 0x1) << 30)
+#define   G_00821C_CPC_CPF_BUSY(x)                                    (((x) >> 30) & 0x1)
+#define   C_00821C_CPC_CPF_BUSY                                       0xBFFFFFFF
+#define   S_00821C_CPF_BUSY(x)                                        (((unsigned)(x) & 0x1) << 31) /* packs bit 31; unsigned cast because shifting a signed 1 into the sign bit is undefined */
+#define   G_00821C_CPF_BUSY(x)                                        (((x) >> 31) & 0x1)
+#define   C_00821C_CPF_BUSY                                           0x7FFFFFFF
+#define R_008220_CP_CPF_BUSY_STAT                                       0x008220 /* single-bit fields in bits 0-9 and 11-31; no field is defined here for bit 10 */
+#define   S_008220_REG_BUS_FIFO_BUSY(x)                               (((x) & 0x1) << 0)
+#define   G_008220_REG_BUS_FIFO_BUSY(x)                               (((x) >> 0) & 0x1)
+#define   C_008220_REG_BUS_FIFO_BUSY                                  0xFFFFFFFE
+#define   S_008220_CSF_RING_BUSY(x)                                   (((x) & 0x1) << 1)
+#define   G_008220_CSF_RING_BUSY(x)                                   (((x) >> 1) & 0x1)
+#define   C_008220_CSF_RING_BUSY                                      0xFFFFFFFD
+#define   S_008220_CSF_INDIRECT1_BUSY(x)                              (((x) & 0x1) << 2)
+#define   G_008220_CSF_INDIRECT1_BUSY(x)                              (((x) >> 2) & 0x1)
+#define   C_008220_CSF_INDIRECT1_BUSY                                 0xFFFFFFFB
+#define   S_008220_CSF_INDIRECT2_BUSY(x)                              (((x) & 0x1) << 3)
+#define   G_008220_CSF_INDIRECT2_BUSY(x)                              (((x) >> 3) & 0x1)
+#define   C_008220_CSF_INDIRECT2_BUSY                                 0xFFFFFFF7
+#define   S_008220_CSF_STATE_BUSY(x)                                  (((x) & 0x1) << 4)
+#define   G_008220_CSF_STATE_BUSY(x)                                  (((x) >> 4) & 0x1)
+#define   C_008220_CSF_STATE_BUSY                                     0xFFFFFFEF
+#define   S_008220_CSF_CE_INDR1_BUSY(x)                               (((x) & 0x1) << 5)
+#define   G_008220_CSF_CE_INDR1_BUSY(x)                               (((x) >> 5) & 0x1)
+#define   C_008220_CSF_CE_INDR1_BUSY                                  0xFFFFFFDF
+#define   S_008220_CSF_CE_INDR2_BUSY(x)                               (((x) & 0x1) << 6)
+#define   G_008220_CSF_CE_INDR2_BUSY(x)                               (((x) >> 6) & 0x1)
+#define   C_008220_CSF_CE_INDR2_BUSY                                  0xFFFFFFBF
+#define   S_008220_CSF_ARBITER_BUSY(x)                                (((x) & 0x1) << 7)
+#define   G_008220_CSF_ARBITER_BUSY(x)                                (((x) >> 7) & 0x1)
+#define   C_008220_CSF_ARBITER_BUSY                                   0xFFFFFF7F
+#define   S_008220_CSF_INPUT_BUSY(x)                                  (((x) & 0x1) << 8)
+#define   G_008220_CSF_INPUT_BUSY(x)                                  (((x) >> 8) & 0x1)
+#define   C_008220_CSF_INPUT_BUSY                                     0xFFFFFEFF
+#define   S_008220_OUTSTANDING_READ_TAGS(x)                           (((x) & 0x1) << 9)
+#define   G_008220_OUTSTANDING_READ_TAGS(x)                           (((x) >> 9) & 0x1)
+#define   C_008220_OUTSTANDING_READ_TAGS                              0xFFFFFDFF
+#define   S_008220_HPD_PROCESSING_EOP_BUSY(x)                         (((x) & 0x1) << 11)
+#define   G_008220_HPD_PROCESSING_EOP_BUSY(x)                         (((x) >> 11) & 0x1)
+#define   C_008220_HPD_PROCESSING_EOP_BUSY                            0xFFFFF7FF
+#define   S_008220_HQD_DISPATCH_BUSY(x)                               (((x) & 0x1) << 12)
+#define   G_008220_HQD_DISPATCH_BUSY(x)                               (((x) >> 12) & 0x1)
+#define   C_008220_HQD_DISPATCH_BUSY                                  0xFFFFEFFF
+#define   S_008220_HQD_IQ_TIMER_BUSY(x)                               (((x) & 0x1) << 13)
+#define   G_008220_HQD_IQ_TIMER_BUSY(x)                               (((x) >> 13) & 0x1)
+#define   C_008220_HQD_IQ_TIMER_BUSY                                  0xFFFFDFFF
+#define   S_008220_HQD_DMA_OFFLOAD_BUSY(x)                            (((x) & 0x1) << 14)
+#define   G_008220_HQD_DMA_OFFLOAD_BUSY(x)                            (((x) >> 14) & 0x1)
+#define   C_008220_HQD_DMA_OFFLOAD_BUSY                               0xFFFFBFFF
+#define   S_008220_HQD_WAIT_SEMAPHORE_BUSY(x)                         (((x) & 0x1) << 15)
+#define   G_008220_HQD_WAIT_SEMAPHORE_BUSY(x)                         (((x) >> 15) & 0x1)
+#define   C_008220_HQD_WAIT_SEMAPHORE_BUSY                            0xFFFF7FFF
+#define   S_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x)                       (((x) & 0x1) << 16)
+#define   G_008220_HQD_SIGNAL_SEMAPHORE_BUSY(x)                       (((x) >> 16) & 0x1)
+#define   C_008220_HQD_SIGNAL_SEMAPHORE_BUSY                          0xFFFEFFFF
+#define   S_008220_HQD_MESSAGE_BUSY(x)                                (((x) & 0x1) << 17)
+#define   G_008220_HQD_MESSAGE_BUSY(x)                                (((x) >> 17) & 0x1)
+#define   C_008220_HQD_MESSAGE_BUSY                                   0xFFFDFFFF
+#define   S_008220_HQD_PQ_FETCHER_BUSY(x)                             (((x) & 0x1) << 18)
+#define   G_008220_HQD_PQ_FETCHER_BUSY(x)                             (((x) >> 18) & 0x1)
+#define   C_008220_HQD_PQ_FETCHER_BUSY                                0xFFFBFFFF
+#define   S_008220_HQD_IB_FETCHER_BUSY(x)                             (((x) & 0x1) << 19)
+#define   G_008220_HQD_IB_FETCHER_BUSY(x)                             (((x) >> 19) & 0x1)
+#define   C_008220_HQD_IB_FETCHER_BUSY                                0xFFF7FFFF
+#define   S_008220_HQD_IQ_FETCHER_BUSY(x)                             (((x) & 0x1) << 20)
+#define   G_008220_HQD_IQ_FETCHER_BUSY(x)                             (((x) >> 20) & 0x1)
+#define   C_008220_HQD_IQ_FETCHER_BUSY                                0xFFEFFFFF
+#define   S_008220_HQD_EOP_FETCHER_BUSY(x)                            (((x) & 0x1) << 21)
+#define   G_008220_HQD_EOP_FETCHER_BUSY(x)                            (((x) >> 21) & 0x1)
+#define   C_008220_HQD_EOP_FETCHER_BUSY                               0xFFDFFFFF
+#define   S_008220_HQD_CONSUMED_RPTR_BUSY(x)                          (((x) & 0x1) << 22)
+#define   G_008220_HQD_CONSUMED_RPTR_BUSY(x)                          (((x) >> 22) & 0x1)
+#define   C_008220_HQD_CONSUMED_RPTR_BUSY                             0xFFBFFFFF
+#define   S_008220_HQD_FETCHER_ARB_BUSY(x)                            (((x) & 0x1) << 23)
+#define   G_008220_HQD_FETCHER_ARB_BUSY(x)                            (((x) >> 23) & 0x1)
+#define   C_008220_HQD_FETCHER_ARB_BUSY                               0xFF7FFFFF
+#define   S_008220_HQD_ROQ_ALIGN_BUSY(x)                              (((x) & 0x1) << 24)
+#define   G_008220_HQD_ROQ_ALIGN_BUSY(x)                              (((x) >> 24) & 0x1)
+#define   C_008220_HQD_ROQ_ALIGN_BUSY                                 0xFEFFFFFF
+#define   S_008220_HQD_ROQ_EOP_BUSY(x)                                (((x) & 0x1) << 25)
+#define   G_008220_HQD_ROQ_EOP_BUSY(x)                                (((x) >> 25) & 0x1)
+#define   C_008220_HQD_ROQ_EOP_BUSY                                   0xFDFFFFFF
+#define   S_008220_HQD_ROQ_IQ_BUSY(x)                                 (((x) & 0x1) << 26)
+#define   G_008220_HQD_ROQ_IQ_BUSY(x)                                 (((x) >> 26) & 0x1)
+#define   C_008220_HQD_ROQ_IQ_BUSY                                    0xFBFFFFFF
+#define   S_008220_HQD_ROQ_PQ_BUSY(x)                                 (((x) & 0x1) << 27)
+#define   G_008220_HQD_ROQ_PQ_BUSY(x)                                 (((x) >> 27) & 0x1)
+#define   C_008220_HQD_ROQ_PQ_BUSY                                    0xF7FFFFFF
+#define   S_008220_HQD_ROQ_IB_BUSY(x)                                 (((x) & 0x1) << 28)
+#define   G_008220_HQD_ROQ_IB_BUSY(x)                                 (((x) >> 28) & 0x1)
+#define   C_008220_HQD_ROQ_IB_BUSY                                    0xEFFFFFFF
+#define   S_008220_HQD_WPTR_POLL_BUSY(x)                              (((x) & 0x1) << 29)
+#define   G_008220_HQD_WPTR_POLL_BUSY(x)                              (((x) >> 29) & 0x1)
+#define   C_008220_HQD_WPTR_POLL_BUSY                                 0xDFFFFFFF
+#define   S_008220_HQD_PQ_BUSY(x)                                     (((x) & 0x1) << 30)
+#define   G_008220_HQD_PQ_BUSY(x)                                     (((x) >> 30) & 0x1)
+#define   C_008220_HQD_PQ_BUSY                                        0xBFFFFFFF
+#define   S_008220_HQD_IB_BUSY(x)                                     (((unsigned)(x) & 0x1) << 31) /* packs bit 31; unsigned cast because shifting a signed 1 into the sign bit is undefined */
+#define   G_008220_HQD_IB_BUSY(x)                                     (((x) >> 31) & 0x1)
+#define   C_008220_HQD_IB_BUSY                                        0x7FFFFFFF
+#define R_008224_CP_CPF_STALLED_STAT1                                   0x008224 /* single-bit fields in bits 0-3 and 5-9; no field is defined here for bit 4 */
+#define   S_008224_RING_FETCHING_DATA(x)                              (((x) & 0x1) << 0)
+#define   G_008224_RING_FETCHING_DATA(x)                              (((x) >> 0) & 0x1)
+#define   C_008224_RING_FETCHING_DATA                                 0xFFFFFFFE
+#define   S_008224_INDR1_FETCHING_DATA(x)                             (((x) & 0x1) << 1)
+#define   G_008224_INDR1_FETCHING_DATA(x)                             (((x) >> 1) & 0x1)
+#define   C_008224_INDR1_FETCHING_DATA                                0xFFFFFFFD
+#define   S_008224_INDR2_FETCHING_DATA(x)                             (((x) & 0x1) << 2)
+#define   G_008224_INDR2_FETCHING_DATA(x)                             (((x) >> 2) & 0x1)
+#define   C_008224_INDR2_FETCHING_DATA                                0xFFFFFFFB
+#define   S_008224_STATE_FETCHING_DATA(x)                             (((x) & 0x1) << 3)
+#define   G_008224_STATE_FETCHING_DATA(x)                             (((x) >> 3) & 0x1)
+#define   C_008224_STATE_FETCHING_DATA                                0xFFFFFFF7
+#define   S_008224_TCIU_WAITING_ON_FREE(x)                            (((x) & 0x1) << 5)
+#define   G_008224_TCIU_WAITING_ON_FREE(x)                            (((x) >> 5) & 0x1)
+#define   C_008224_TCIU_WAITING_ON_FREE                               0xFFFFFFDF
+#define   S_008224_TCIU_WAITING_ON_TAGS(x)                            (((x) & 0x1) << 6)
+#define   G_008224_TCIU_WAITING_ON_TAGS(x)                            (((x) >> 6) & 0x1)
+#define   C_008224_TCIU_WAITING_ON_TAGS                               0xFFFFFFBF
+#define   S_008224_ATCL2IU_WAITING_ON_FREE(x)                         (((x) & 0x1) << 7)
+#define   G_008224_ATCL2IU_WAITING_ON_FREE(x)                         (((x) >> 7) & 0x1)
+#define   C_008224_ATCL2IU_WAITING_ON_FREE                            0xFFFFFF7F
+#define   S_008224_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) & 0x1) << 8)
+#define   G_008224_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) >> 8) & 0x1)
+#define   C_008224_ATCL2IU_WAITING_ON_TAGS                            0xFFFFFEFF
+#define   S_008224_ATCL1_WAITING_ON_TRANS(x)                          (((x) & 0x1) << 9)
+#define   G_008224_ATCL1_WAITING_ON_TRANS(x)                          (((x) >> 9) & 0x1)
+#define   C_008224_ATCL1_WAITING_ON_TRANS                             0xFFFFFDFF
 #define R_030230_CP_COHER_SIZE_HI                                       0x030230
 #define   S_030230_COHER_SIZE_HI_256B(x)                              (((x) & 0xFF) << 0)
 #define   G_030230_COHER_SIZE_HI_256B(x)                              (((x) >> 0) & 0xFF)
@@ -375,10 +1299,6 @@
 #define   C_0088C4_ES_LIMIT                                           0xFFE0FFFF
 #define R_0088C8_VGT_ESGS_RING_SIZE                                     0x0088C8
 #define R_0088CC_VGT_GSVS_RING_SIZE                                     0x0088CC
-/* CIK */
-#define R_030900_VGT_ESGS_RING_SIZE                                     0x030900
-#define R_030904_VGT_GSVS_RING_SIZE                                     0x030904
-/*     */
 #define R_0088D4_VGT_GS_VERTEX_REUSE                                    0x0088D4
 #define   S_0088D4_VERT_REUSE(x)                                      (((x) & 0x1F) << 0)
 #define   G_0088D4_VERT_REUSE(x)                                      (((x) >> 0) & 0x1F)
@@ -461,7 +1381,293 @@
 #define   S_008B10_CURRENT_COUNT(x)                                   (((x) & 0xFF) << 8)
 #define   G_008B10_CURRENT_COUNT(x)                                   (((x) >> 8) & 0xFF)
 #define   C_008B10_CURRENT_COUNT                                      0xFFFF00FF
+#define R_008670_CP_STALLED_STAT3                                       0x008670
+#define   S_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x)                        (((x) & 0x1) << 0)
+#define   G_008670_CE_TO_CSF_NOT_RDY_TO_RCV(x)                        (((x) >> 0) & 0x1)
+#define   C_008670_CE_TO_CSF_NOT_RDY_TO_RCV                           0xFFFFFFFE
+#define   S_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x)           (((x) & 0x1) << 1)
+#define   G_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV(x)           (((x) >> 1) & 0x1)
+#define   C_008670_CE_TO_RAM_INIT_FETCHER_NOT_RDY_TO_RCV              0xFFFFFFFD
+#define   S_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x)        (((x) & 0x1) << 2)
+#define   G_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER(x)        (((x) >> 2) & 0x1)
+#define   C_008670_CE_WAITING_ON_DATA_FROM_RAM_INIT_FETCHER           0xFFFFFFFB
+#define   S_008670_CE_TO_RAM_INIT_NOT_RDY(x)                          (((x) & 0x1) << 3)
+#define   G_008670_CE_TO_RAM_INIT_NOT_RDY(x)                          (((x) >> 3) & 0x1)
+#define   C_008670_CE_TO_RAM_INIT_NOT_RDY                             0xFFFFFFF7
+#define   S_008670_CE_TO_RAM_DUMP_NOT_RDY(x)                          (((x) & 0x1) << 4)
+#define   G_008670_CE_TO_RAM_DUMP_NOT_RDY(x)                          (((x) >> 4) & 0x1)
+#define   C_008670_CE_TO_RAM_DUMP_NOT_RDY                             0xFFFFFFEF
+#define   S_008670_CE_TO_RAM_WRITE_NOT_RDY(x)                         (((x) & 0x1) << 5)
+#define   G_008670_CE_TO_RAM_WRITE_NOT_RDY(x)                         (((x) >> 5) & 0x1)
+#define   C_008670_CE_TO_RAM_WRITE_NOT_RDY                            0xFFFFFFDF
+#define   S_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x)                   (((x) & 0x1) << 6)
+#define   G_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV(x)                   (((x) >> 6) & 0x1)
+#define   C_008670_CE_TO_INC_FIFO_NOT_RDY_TO_RCV                      0xFFFFFFBF
+#define   S_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x)                    (((x) & 0x1) << 7)
+#define   G_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV(x)                    (((x) >> 7) & 0x1)
+#define   C_008670_CE_TO_WR_FIFO_NOT_RDY_TO_RCV                       0xFFFFFF7F
+#define   S_008670_CE_WAITING_ON_BUFFER_DATA(x)                       (((x) & 0x1) << 10)
+#define   G_008670_CE_WAITING_ON_BUFFER_DATA(x)                       (((x) >> 10) & 0x1)
+#define   C_008670_CE_WAITING_ON_BUFFER_DATA                          0xFFFFFBFF
+#define   S_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x)                    (((x) & 0x1) << 11)
+#define   G_008670_CE_WAITING_ON_CE_BUFFER_FLAG(x)                    (((x) >> 11) & 0x1)
+#define   C_008670_CE_WAITING_ON_CE_BUFFER_FLAG                       0xFFFFF7FF
+#define   S_008670_CE_WAITING_ON_DE_COUNTER(x)                        (((x) & 0x1) << 12)
+#define   G_008670_CE_WAITING_ON_DE_COUNTER(x)                        (((x) >> 12) & 0x1)
+#define   C_008670_CE_WAITING_ON_DE_COUNTER                           0xFFFFEFFF
+#define   S_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x)              (((x) & 0x1) << 13)
+#define   G_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW(x)              (((x) >> 13) & 0x1)
+#define   C_008670_CE_WAITING_ON_DE_COUNTER_UNDERFLOW                 0xFFFFDFFF
+#define   S_008670_TCIU_WAITING_ON_FREE(x)                            (((x) & 0x1) << 14)
+#define   G_008670_TCIU_WAITING_ON_FREE(x)                            (((x) >> 14) & 0x1)
+#define   C_008670_TCIU_WAITING_ON_FREE                               0xFFFFBFFF
+#define   S_008670_TCIU_WAITING_ON_TAGS(x)                            (((x) & 0x1) << 15)
+#define   G_008670_TCIU_WAITING_ON_TAGS(x)                            (((x) >> 15) & 0x1)
+#define   C_008670_TCIU_WAITING_ON_TAGS                               0xFFFF7FFF
+#define   S_008670_CE_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) & 0x1) << 16)
+#define   G_008670_CE_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) >> 16) & 0x1)
+#define   C_008670_CE_STALLED_ON_TC_WR_CONFIRM                        0xFFFEFFFF
+#define   S_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) & 0x1) << 17)
+#define   G_008670_CE_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) >> 17) & 0x1)
+#define   C_008670_CE_STALLED_ON_ATOMIC_RTN_DATA                      0xFFFDFFFF
+#define   S_008670_ATCL2IU_WAITING_ON_FREE(x)                         (((x) & 0x1) << 18)
+#define   G_008670_ATCL2IU_WAITING_ON_FREE(x)                         (((x) >> 18) & 0x1)
+#define   C_008670_ATCL2IU_WAITING_ON_FREE                            0xFFFBFFFF
+#define   S_008670_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) & 0x1) << 19)
+#define   G_008670_ATCL2IU_WAITING_ON_TAGS(x)                         (((x) >> 19) & 0x1)
+#define   C_008670_ATCL2IU_WAITING_ON_TAGS                            0xFFF7FFFF
+#define   S_008670_ATCL1_WAITING_ON_TRANS(x)                          (((x) & 0x1) << 20)
+#define   G_008670_ATCL1_WAITING_ON_TRANS(x)                          (((x) >> 20) & 0x1)
+#define   C_008670_ATCL1_WAITING_ON_TRANS                             0xFFEFFFFF
+#define R_008674_CP_STALLED_STAT1                                       0x008674
+#define   S_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 0)
+#define   G_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV(x)                      (((x) >> 0) & 0x1)
+#define   C_008674_RBIU_TO_DMA_NOT_RDY_TO_RCV                         0xFFFFFFFE
+#define   S_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 2)
+#define   G_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV(x)                      (((x) >> 2) & 0x1)
+#define   C_008674_RBIU_TO_SEM_NOT_RDY_TO_RCV                         0xFFFFFFFB
+#define   S_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x)                    (((x) & 0x1) << 4)
+#define   G_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV(x)                    (((x) >> 4) & 0x1)
+#define   C_008674_RBIU_TO_MEMWR_NOT_RDY_TO_RCV                       0xFFFFFFEF
+#define   S_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x)                    (((x) & 0x1) << 10)
+#define   G_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG(x)                    (((x) >> 10) & 0x1)
+#define   C_008674_ME_HAS_ACTIVE_CE_BUFFER_FLAG                       0xFFFFFBFF
+#define   S_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x)                    (((x) & 0x1) << 11)
+#define   G_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG(x)                    (((x) >> 11) & 0x1)
+#define   C_008674_ME_HAS_ACTIVE_DE_BUFFER_FLAG                       0xFFFFF7FF
+#define   S_008674_ME_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) & 0x1) << 12)
+#define   G_008674_ME_STALLED_ON_TC_WR_CONFIRM(x)                     (((x) >> 12) & 0x1)
+#define   C_008674_ME_STALLED_ON_TC_WR_CONFIRM                        0xFFFFEFFF
+#define   S_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) & 0x1) << 13)
+#define   G_008674_ME_STALLED_ON_ATOMIC_RTN_DATA(x)                   (((x) >> 13) & 0x1)
+#define   C_008674_ME_STALLED_ON_ATOMIC_RTN_DATA                      0xFFFFDFFF
+#define   S_008674_ME_WAITING_ON_TC_READ_DATA(x)                      (((x) & 0x1) << 14)
+#define   G_008674_ME_WAITING_ON_TC_READ_DATA(x)                      (((x) >> 14) & 0x1)
+#define   C_008674_ME_WAITING_ON_TC_READ_DATA                         0xFFFFBFFF
+#define   S_008674_ME_WAITING_ON_REG_READ_DATA(x)                     (((x) & 0x1) << 15)
+#define   G_008674_ME_WAITING_ON_REG_READ_DATA(x)                     (((x) >> 15) & 0x1)
+#define   C_008674_ME_WAITING_ON_REG_READ_DATA                        0xFFFF7FFF
+#define   S_008674_RCIU_WAITING_ON_GDS_FREE(x)                        (((x) & 0x1) << 23)
+#define   G_008674_RCIU_WAITING_ON_GDS_FREE(x)                        (((x) >> 23) & 0x1)
+#define   C_008674_RCIU_WAITING_ON_GDS_FREE                           0xFF7FFFFF
+#define   S_008674_RCIU_WAITING_ON_GRBM_FREE(x)                       (((x) & 0x1) << 24)
+#define   G_008674_RCIU_WAITING_ON_GRBM_FREE(x)                       (((x) >> 24) & 0x1)
+#define   C_008674_RCIU_WAITING_ON_GRBM_FREE                          0xFEFFFFFF
+#define   S_008674_RCIU_WAITING_ON_VGT_FREE(x)                        (((x) & 0x1) << 25)
+#define   G_008674_RCIU_WAITING_ON_VGT_FREE(x)                        (((x) >> 25) & 0x1)
+#define   C_008674_RCIU_WAITING_ON_VGT_FREE                           0xFDFFFFFF
+#define   S_008674_RCIU_STALLED_ON_ME_READ(x)                         (((x) & 0x1) << 26)
+#define   G_008674_RCIU_STALLED_ON_ME_READ(x)                         (((x) >> 26) & 0x1)
+#define   C_008674_RCIU_STALLED_ON_ME_READ                            0xFBFFFFFF
+#define   S_008674_RCIU_STALLED_ON_DMA_READ(x)                        (((x) & 0x1) << 27)
+#define   G_008674_RCIU_STALLED_ON_DMA_READ(x)                        (((x) >> 27) & 0x1)
+#define   C_008674_RCIU_STALLED_ON_DMA_READ                           0xF7FFFFFF
+#define   S_008674_RCIU_STALLED_ON_APPEND_READ(x)                     (((x) & 0x1) << 28)
+#define   G_008674_RCIU_STALLED_ON_APPEND_READ(x)                     (((x) >> 28) & 0x1)
+#define   C_008674_RCIU_STALLED_ON_APPEND_READ                        0xEFFFFFFF
+#define   S_008674_RCIU_HALTED_BY_REG_VIOLATION(x)                    (((x) & 0x1) << 29)
+#define   G_008674_RCIU_HALTED_BY_REG_VIOLATION(x)                    (((x) >> 29) & 0x1)
+#define   C_008674_RCIU_HALTED_BY_REG_VIOLATION                       0xDFFFFFFF
+#define R_008678_CP_STALLED_STAT2                                       0x008678
+#define   S_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x)                       (((x) & 0x1) << 0)
+#define   G_008678_PFP_TO_CSF_NOT_RDY_TO_RCV(x)                       (((x) >> 0) & 0x1)
+#define   C_008678_PFP_TO_CSF_NOT_RDY_TO_RCV                          0xFFFFFFFE
+#define   S_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x)                       (((x) & 0x1) << 1)
+#define   G_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV(x)                       (((x) >> 1) & 0x1)
+#define   C_008678_PFP_TO_MEQ_NOT_RDY_TO_RCV                          0xFFFFFFFD
+#define   S_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 2)
+#define   G_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV(x)                      (((x) >> 2) & 0x1)
+#define   C_008678_PFP_TO_RCIU_NOT_RDY_TO_RCV                         0xFFFFFFFB
+#define   S_008678_PFP_TO_VGT_WRITES_PENDING(x)                       (((x) & 0x1) << 4)
+#define   G_008678_PFP_TO_VGT_WRITES_PENDING(x)                       (((x) >> 4) & 0x1)
+#define   C_008678_PFP_TO_VGT_WRITES_PENDING                          0xFFFFFFEF
+#define   S_008678_PFP_RCIU_READ_PENDING(x)                           (((x) & 0x1) << 5)
+#define   G_008678_PFP_RCIU_READ_PENDING(x)                           (((x) >> 5) & 0x1)
+#define   C_008678_PFP_RCIU_READ_PENDING                              0xFFFFFFDF
+#define   S_008678_PFP_WAITING_ON_BUFFER_DATA(x)                      (((x) & 0x1) << 8)
+#define   G_008678_PFP_WAITING_ON_BUFFER_DATA(x)                      (((x) >> 8) & 0x1)
+#define   C_008678_PFP_WAITING_ON_BUFFER_DATA                         0xFFFFFEFF
+#define   S_008678_ME_WAIT_ON_CE_COUNTER(x)                           (((x) & 0x1) << 9)
+#define   G_008678_ME_WAIT_ON_CE_COUNTER(x)                           (((x) >> 9) & 0x1)
+#define   C_008678_ME_WAIT_ON_CE_COUNTER                              0xFFFFFDFF
+#define   S_008678_ME_WAIT_ON_AVAIL_BUFFER(x)                         (((x) & 0x1) << 10)
+#define   G_008678_ME_WAIT_ON_AVAIL_BUFFER(x)                         (((x) >> 10) & 0x1)
+#define   C_008678_ME_WAIT_ON_AVAIL_BUFFER                            0xFFFFFBFF
+#define   S_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x)                        (((x) & 0x1) << 11)
+#define   G_008678_GFX_CNTX_NOT_AVAIL_TO_ME(x)                        (((x) >> 11) & 0x1)
+#define   C_008678_GFX_CNTX_NOT_AVAIL_TO_ME                           0xFFFFF7FF
+#define   S_008678_ME_RCIU_NOT_RDY_TO_RCV(x)                          (((x) & 0x1) << 12)
+#define   G_008678_ME_RCIU_NOT_RDY_TO_RCV(x)                          (((x) >> 12) & 0x1)
+#define   C_008678_ME_RCIU_NOT_RDY_TO_RCV                             0xFFFFEFFF
+#define   S_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x)                      (((x) & 0x1) << 13)
+#define   G_008678_ME_TO_CONST_NOT_RDY_TO_RCV(x)                      (((x) >> 13) & 0x1)
+#define   C_008678_ME_TO_CONST_NOT_RDY_TO_RCV                         0xFFFFDFFF
+#define   S_008678_ME_WAITING_DATA_FROM_PFP(x)                        (((x) & 0x1) << 14)
+#define   G_008678_ME_WAITING_DATA_FROM_PFP(x)                        (((x) >> 14) & 0x1)
+#define   C_008678_ME_WAITING_DATA_FROM_PFP                           0xFFFFBFFF
+#define   S_008678_ME_WAITING_ON_PARTIAL_FLUSH(x)                     (((x) & 0x1) << 15)
+#define   G_008678_ME_WAITING_ON_PARTIAL_FLUSH(x)                     (((x) >> 15) & 0x1)
+#define   C_008678_ME_WAITING_ON_PARTIAL_FLUSH                        0xFFFF7FFF
+#define   S_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) & 0x1) << 16)
+#define   G_008678_MEQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) >> 16) & 0x1)
+#define   C_008678_MEQ_TO_ME_NOT_RDY_TO_RCV                           0xFFFEFFFF
+#define   S_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) & 0x1) << 17)
+#define   G_008678_STQ_TO_ME_NOT_RDY_TO_RCV(x)                        (((x) >> 17) & 0x1)
+#define   C_008678_STQ_TO_ME_NOT_RDY_TO_RCV                           0xFFFDFFFF
+#define   S_008678_ME_WAITING_DATA_FROM_STQ(x)                        (((x) & 0x1) << 18)
+#define   G_008678_ME_WAITING_DATA_FROM_STQ(x)                        (((x) >> 18) & 0x1)
+#define   C_008678_ME_WAITING_DATA_FROM_STQ                           0xFFFBFFFF
+#define   S_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x)                    (((x) & 0x1) << 19)
+#define   G_008678_PFP_STALLED_ON_TC_WR_CONFIRM(x)                    (((x) >> 19) & 0x1)
+#define   C_008678_PFP_STALLED_ON_TC_WR_CONFIRM                       0xFFF7FFFF
+#define   S_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x)                  (((x) & 0x1) << 20)
+#define   G_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA(x)                  (((x) >> 20) & 0x1)
+#define   C_008678_PFP_STALLED_ON_ATOMIC_RTN_DATA                     0xFFEFFFFF
+#define   S_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x)                     (((x) & 0x1) << 21)
+#define   G_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE(x)                     (((x) >> 21) & 0x1)
+#define   C_008678_EOPD_FIFO_NEEDS_SC_EOP_DONE                        0xFFDFFFFF
+#define   S_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x)                      (((x) & 0x1) << 22)
+#define   G_008678_EOPD_FIFO_NEEDS_WR_CONFIRM(x)                      (((x) >> 22) & 0x1)
+#define   C_008678_EOPD_FIFO_NEEDS_WR_CONFIRM                         0xFFBFFFFF
+#define   S_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x)                   (((x) & 0x1) << 23)
+#define   G_008678_STRMO_WR_OF_PRIM_DATA_PENDING(x)                   (((x) >> 23) & 0x1)
+#define   C_008678_STRMO_WR_OF_PRIM_DATA_PENDING                      0xFF7FFFFF
+#define   S_008678_PIPE_STATS_WR_DATA_PENDING(x)                      (((x) & 0x1) << 24)
+#define   G_008678_PIPE_STATS_WR_DATA_PENDING(x)                      (((x) >> 24) & 0x1)
+#define   C_008678_PIPE_STATS_WR_DATA_PENDING                         0xFEFFFFFF
+#define   S_008678_APPEND_RDY_WAIT_ON_CS_DONE(x)                      (((x) & 0x1) << 25)
+#define   G_008678_APPEND_RDY_WAIT_ON_CS_DONE(x)                      (((x) >> 25) & 0x1)
+#define   C_008678_APPEND_RDY_WAIT_ON_CS_DONE                         0xFDFFFFFF
+#define   S_008678_APPEND_RDY_WAIT_ON_PS_DONE(x)                      (((x) & 0x1) << 26)
+#define   G_008678_APPEND_RDY_WAIT_ON_PS_DONE(x)                      (((x) >> 26) & 0x1)
+#define   C_008678_APPEND_RDY_WAIT_ON_PS_DONE                         0xFBFFFFFF
+#define   S_008678_APPEND_WAIT_ON_WR_CONFIRM(x)                       (((x) & 0x1) << 27)
+#define   G_008678_APPEND_WAIT_ON_WR_CONFIRM(x)                       (((x) >> 27) & 0x1)
+#define   C_008678_APPEND_WAIT_ON_WR_CONFIRM                          0xF7FFFFFF
+#define   S_008678_APPEND_ACTIVE_PARTITION(x)                         (((x) & 0x1) << 28)
+#define   G_008678_APPEND_ACTIVE_PARTITION(x)                         (((x) >> 28) & 0x1)
+#define   C_008678_APPEND_ACTIVE_PARTITION                            0xEFFFFFFF
+#define   S_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x)                 (((x) & 0x1) << 29)
+#define   G_008678_APPEND_WAITING_TO_SEND_MEMWRITE(x)                 (((x) >> 29) & 0x1)
+#define   C_008678_APPEND_WAITING_TO_SEND_MEMWRITE                    0xDFFFFFFF
+#define   S_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x)                      (((x) & 0x1) << 30)
+#define   G_008678_SURF_SYNC_NEEDS_IDLE_CNTXS(x)                      (((x) >> 30) & 0x1)
+#define   C_008678_SURF_SYNC_NEEDS_IDLE_CNTXS                         0xBFFFFFFF
+#define   S_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x)                       (((x) & 0x1) << 31)
+#define   G_008678_SURF_SYNC_NEEDS_ALL_CLEAN(x)                       (((x) >> 31) & 0x1)
+#define   C_008678_SURF_SYNC_NEEDS_ALL_CLEAN                          0x7FFFFFFF
+#define R_008680_CP_STAT                                                0x008680
+#define   S_008680_ROQ_RING_BUSY(x)                                   (((x) & 0x1) << 9)
+#define   G_008680_ROQ_RING_BUSY(x)                                   (((x) >> 9) & 0x1)
+#define   C_008680_ROQ_RING_BUSY                                      0xFFFFFDFF
+#define   S_008680_ROQ_INDIRECT1_BUSY(x)                              (((x) & 0x1) << 10)
+#define   G_008680_ROQ_INDIRECT1_BUSY(x)                              (((x) >> 10) & 0x1)
+#define   C_008680_ROQ_INDIRECT1_BUSY                                 0xFFFFFBFF
+#define   S_008680_ROQ_INDIRECT2_BUSY(x)                              (((x) & 0x1) << 11)
+#define   G_008680_ROQ_INDIRECT2_BUSY(x)                              (((x) >> 11) & 0x1)
+#define   C_008680_ROQ_INDIRECT2_BUSY                                 0xFFFFF7FF
+#define   S_008680_ROQ_STATE_BUSY(x)                                  (((x) & 0x1) << 12)
+#define   G_008680_ROQ_STATE_BUSY(x)                                  (((x) >> 12) & 0x1)
+#define   C_008680_ROQ_STATE_BUSY                                     0xFFFFEFFF
+#define   S_008680_DC_BUSY(x)                                         (((x) & 0x1) << 13)
+#define   G_008680_DC_BUSY(x)                                         (((x) >> 13) & 0x1)
+#define   C_008680_DC_BUSY                                            0xFFFFDFFF
+#define   S_008680_ATCL2IU_BUSY(x)                                    (((x) & 0x1) << 14)
+#define   G_008680_ATCL2IU_BUSY(x)                                    (((x) >> 14) & 0x1)
+#define   C_008680_ATCL2IU_BUSY                                       0xFFFFBFFF
+#define   S_008680_PFP_BUSY(x)                                        (((x) & 0x1) << 15)
+#define   G_008680_PFP_BUSY(x)                                        (((x) >> 15) & 0x1)
+#define   C_008680_PFP_BUSY                                           0xFFFF7FFF
+#define   S_008680_MEQ_BUSY(x)                                        (((x) & 0x1) << 16)
+#define   G_008680_MEQ_BUSY(x)                                        (((x) >> 16) & 0x1)
+#define   C_008680_MEQ_BUSY                                           0xFFFEFFFF
+#define   S_008680_ME_BUSY(x)                                         (((x) & 0x1) << 17)
+#define   G_008680_ME_BUSY(x)                                         (((x) >> 17) & 0x1)
+#define   C_008680_ME_BUSY                                            0xFFFDFFFF
+#define   S_008680_QUERY_BUSY(x)                                      (((x) & 0x1) << 18)
+#define   G_008680_QUERY_BUSY(x)                                      (((x) >> 18) & 0x1)
+#define   C_008680_QUERY_BUSY                                         0xFFFBFFFF
+#define   S_008680_SEMAPHORE_BUSY(x)                                  (((x) & 0x1) << 19)
+#define   G_008680_SEMAPHORE_BUSY(x)                                  (((x) >> 19) & 0x1)
+#define   C_008680_SEMAPHORE_BUSY                                     0xFFF7FFFF
+#define   S_008680_INTERRUPT_BUSY(x)                                  (((x) & 0x1) << 20)
+#define   G_008680_INTERRUPT_BUSY(x)                                  (((x) >> 20) & 0x1)
+#define   C_008680_INTERRUPT_BUSY                                     0xFFEFFFFF
+#define   S_008680_SURFACE_SYNC_BUSY(x)                               (((x) & 0x1) << 21)
+#define   G_008680_SURFACE_SYNC_BUSY(x)                               (((x) >> 21) & 0x1)
+#define   C_008680_SURFACE_SYNC_BUSY                                  0xFFDFFFFF
+#define   S_008680_DMA_BUSY(x)                                        (((x) & 0x1) << 22)
+#define   G_008680_DMA_BUSY(x)                                        (((x) >> 22) & 0x1)
+#define   C_008680_DMA_BUSY                                           0xFFBFFFFF
+#define   S_008680_RCIU_BUSY(x)                                       (((x) & 0x1) << 23)
+#define   G_008680_RCIU_BUSY(x)                                       (((x) >> 23) & 0x1)
+#define   C_008680_RCIU_BUSY                                          0xFF7FFFFF
+#define   S_008680_SCRATCH_RAM_BUSY(x)                                (((x) & 0x1) << 24)
+#define   G_008680_SCRATCH_RAM_BUSY(x)                                (((x) >> 24) & 0x1)
+#define   C_008680_SCRATCH_RAM_BUSY                                   0xFEFFFFFF
+#define   S_008680_CPC_CPG_BUSY(x)                                    (((x) & 0x1) << 25)
+#define   G_008680_CPC_CPG_BUSY(x)                                    (((x) >> 25) & 0x1)
+#define   C_008680_CPC_CPG_BUSY                                       0xFDFFFFFF
+#define   S_008680_CE_BUSY(x)                                         (((x) & 0x1) << 26)
+#define   G_008680_CE_BUSY(x)                                         (((x) >> 26) & 0x1)
+#define   C_008680_CE_BUSY                                            0xFBFFFFFF
+#define   S_008680_TCIU_BUSY(x)                                       (((x) & 0x1) << 27)
+#define   G_008680_TCIU_BUSY(x)                                       (((x) >> 27) & 0x1)
+#define   C_008680_TCIU_BUSY                                          0xF7FFFFFF
+#define   S_008680_ROQ_CE_RING_BUSY(x)                                (((x) & 0x1) << 28)
+#define   G_008680_ROQ_CE_RING_BUSY(x)                                (((x) >> 28) & 0x1)
+#define   C_008680_ROQ_CE_RING_BUSY                                   0xEFFFFFFF
+#define   S_008680_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) & 0x1) << 29)
+#define   G_008680_ROQ_CE_INDIRECT1_BUSY(x)                           (((x) >> 29) & 0x1)
+#define   C_008680_ROQ_CE_INDIRECT1_BUSY                              0xDFFFFFFF
+#define   S_008680_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) & 0x1) << 30)
+#define   G_008680_ROQ_CE_INDIRECT2_BUSY(x)                           (((x) >> 30) & 0x1)
+#define   C_008680_ROQ_CE_INDIRECT2_BUSY                              0xBFFFFFFF
+#define   S_008680_CP_BUSY(x)                                         (((x) & 0x1) << 31)
+#define   G_008680_CP_BUSY(x)                                         (((x) >> 31) & 0x1)
+#define   C_008680_CP_BUSY                                            0x7FFFFFFF
 /* CIK */
+#define R_030800_GRBM_GFX_INDEX                                         0x030800
+#define   S_030800_INSTANCE_INDEX(x)                                  (((x) & 0xFF) << 0)
+#define   G_030800_INSTANCE_INDEX(x)                                  (((x) >> 0) & 0xFF)
+#define   C_030800_INSTANCE_INDEX                                     0xFFFFFF00
+#define   S_030800_SH_INDEX(x)                                        (((x) & 0xFF) << 8)
+#define   G_030800_SH_INDEX(x)                                        (((x) >> 8) & 0xFF)
+#define   C_030800_SH_INDEX                                           0xFFFF00FF
+#define   S_030800_SE_INDEX(x)                                        (((x) & 0xFF) << 16)
+#define   G_030800_SE_INDEX(x)                                        (((x) >> 16) & 0xFF)
+#define   C_030800_SE_INDEX                                           0xFF00FFFF
+#define   S_030800_SH_BROADCAST_WRITES(x)                             (((x) & 0x1) << 29)
+#define   G_030800_SH_BROADCAST_WRITES(x)                             (((x) >> 29) & 0x1)
+#define   C_030800_SH_BROADCAST_WRITES                                0xDFFFFFFF
+#define   S_030800_INSTANCE_BROADCAST_WRITES(x)                       (((x) & 0x1) << 30)
+#define   G_030800_INSTANCE_BROADCAST_WRITES(x)                       (((x) >> 30) & 0x1)
+#define   C_030800_INSTANCE_BROADCAST_WRITES                          0xBFFFFFFF
+#define   S_030800_SE_BROADCAST_WRITES(x)                             (((x) & 0x1) << 31)
+#define   G_030800_SE_BROADCAST_WRITES(x)                             (((x) >> 31) & 0x1)
+#define   C_030800_SE_BROADCAST_WRITES                                0x7FFFFFFF
+#define R_030900_VGT_ESGS_RING_SIZE                                     0x030900
+#define R_030904_VGT_GSVS_RING_SIZE                                     0x030904
 #define R_030908_VGT_PRIMITIVE_TYPE                                     0x030908
 #define   S_030908_PRIM_TYPE(x)                                       (((x) & 0x3F) << 0)
 #define   G_030908_PRIM_TYPE(x)                                       (((x) >> 0) & 0x3F)
@@ -530,6 +1736,34 @@
 #define   S_030A04_CURRENT_COUNT(x)                                   (((x) & 0xFF) << 8)
 #define   G_030A04_CURRENT_COUNT(x)                                   (((x) >> 8) & 0xFF)
 #define   C_030A04_CURRENT_COUNT                                      0xFFFF00FF
+#define R_030A10_PA_SC_SCREEN_EXTENT_MIN_0                              0x030A10
+#define   S_030A10_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A10_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A10_X                                                  0xFFFF0000
+#define   S_030A10_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A10_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A10_Y                                                  0x0000FFFF
+#define R_030A14_PA_SC_SCREEN_EXTENT_MAX_0                              0x030A14
+#define   S_030A14_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A14_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A14_X                                                  0xFFFF0000
+#define   S_030A14_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A14_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A14_Y                                                  0x0000FFFF
+#define R_030A18_PA_SC_SCREEN_EXTENT_MIN_1                              0x030A18
+#define   S_030A18_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A18_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A18_X                                                  0xFFFF0000
+#define   S_030A18_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A18_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A18_Y                                                  0x0000FFFF
+#define R_030A2C_PA_SC_SCREEN_EXTENT_MAX_1                              0x030A2C
+#define   S_030A2C_X(x)                                               (((x) & 0xFFFF) << 0)
+#define   G_030A2C_X(x)                                               (((x) >> 0) & 0xFFFF)
+#define   C_030A2C_X                                                  0xFFFF0000
+#define   S_030A2C_Y(x)                                               (((x) & 0xFFFF) << 16)
+#define   G_030A2C_Y(x)                                               (((x) >> 16) & 0xFFFF)
+#define   C_030A2C_Y                                                  0x0000FFFF
 /*     */
 #define R_008BF0_PA_SC_ENHANCE                                          0x008BF0
 #define   S_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x)                       (((x) & 0x1) << 0)
@@ -608,6 +1842,32 @@
 #define     V_008DFC_SQ_VGPR                                        0x00
 /*     */
 #define R_008DFC_SQ_INST                                                0x008DFC
+#define R_030D20_SQC_CACHES                                             0x030D20
+#define   S_030D20_TARGET_INST(x)                                     (((x) & 0x1) << 0)
+#define   G_030D20_TARGET_INST(x)                                     (((x) >> 0) & 0x1)
+#define   C_030D20_TARGET_INST                                        0xFFFFFFFE
+#define   S_030D20_TARGET_DATA(x)                                     (((x) & 0x1) << 1)
+#define   G_030D20_TARGET_DATA(x)                                     (((x) >> 1) & 0x1)
+#define   C_030D20_TARGET_DATA                                        0xFFFFFFFD
+#define   S_030D20_INVALIDATE(x)                                      (((x) & 0x1) << 2)
+#define   G_030D20_INVALIDATE(x)                                      (((x) >> 2) & 0x1)
+#define   C_030D20_INVALIDATE                                         0xFFFFFFFB
+#define   S_030D20_WRITEBACK(x)                                       (((x) & 0x1) << 3)
+#define   G_030D20_WRITEBACK(x)                                       (((x) >> 3) & 0x1)
+#define   C_030D20_WRITEBACK                                          0xFFFFFFF7
+#define   S_030D20_VOL(x)                                             (((x) & 0x1) << 4)
+#define   G_030D20_VOL(x)                                             (((x) >> 4) & 0x1)
+#define   C_030D20_VOL                                                0xFFFFFFEF
+#define   S_030D20_COMPLETE(x)                                        (((x) & 0x1) << 16)
+#define   G_030D20_COMPLETE(x)                                        (((x) >> 16) & 0x1)
+#define   C_030D20_COMPLETE                                           0xFFFEFFFF
+#define R_030D24_SQC_WRITEBACK                                          0x030D24
+#define   S_030D24_DWB(x)                                             (((x) & 0x1) << 0)
+#define   G_030D24_DWB(x)                                             (((x) >> 0) & 0x1)
+#define   C_030D24_DWB                                                0xFFFFFFFE
+#define   S_030D24_DIRTY(x)                                           (((x) & 0x1) << 1)
+#define   G_030D24_DIRTY(x)                                           (((x) >> 1) & 0x1)
+#define   C_030D24_DIRTY                                              0xFFFFFFFD
 #define R_008DFC_SQ_VOP1                                                0x008DFC
 #define   S_008DFC_SRC0(x)                                            (((x) & 0x1FF) << 0)
 #define   G_008DFC_SRC0(x)                                            (((x) >> 0) & 0x1FF)
@@ -3740,7 +5000,17 @@
 #define   C_008DFC_ENCODING                                           0x03FFFFFF
 #define     V_008DFC_SQ_ENC_MUBUF_FIELD                             0x38
 #endif
+#define R_030E00_TA_CS_BC_BASE_ADDR                                     0x030E00
+#define R_030E04_TA_CS_BC_BASE_ADDR_HI                                  0x030E04
+#define   S_030E04_ADDRESS(x)                                         (((x) & 0xFF) << 0)
+#define   G_030E04_ADDRESS(x)                                         (((x) >> 0) & 0xFF)
+#define   C_030E04_ADDRESS                                            0xFFFFFF00
+#define R_030F00_DB_OCCLUSION_COUNT0_LOW                                0x030F00
 #define R_008F00_SQ_BUF_RSRC_WORD0                                      0x008F00
+#define R_030F04_DB_OCCLUSION_COUNT0_HI                                 0x030F04
+#define   S_030F04_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F04_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F04_COUNT_HI                                           0x80000000
 #define R_008F04_SQ_BUF_RSRC_WORD1                                      0x008F04
 #define   S_008F04_BASE_ADDRESS_HI(x)                                 (((x) & 0xFFFF) << 0)
 #define   G_008F04_BASE_ADDRESS_HI(x)                                 (((x) >> 0) & 0xFFFF)
@@ -3754,7 +5024,12 @@
 #define   S_008F04_SWIZZLE_ENABLE(x)                                  (((x) & 0x1) << 31)
 #define   G_008F04_SWIZZLE_ENABLE(x)                                  (((x) >> 31) & 0x1)
 #define   C_008F04_SWIZZLE_ENABLE                                     0x7FFFFFFF
+#define R_030F08_DB_OCCLUSION_COUNT1_LOW                                0x030F08
 #define R_008F08_SQ_BUF_RSRC_WORD2                                      0x008F08
+#define R_030F0C_DB_OCCLUSION_COUNT1_HI                                 0x030F0C
+#define   S_030F0C_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F0C_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F0C_COUNT_HI                                           0x80000000
 #define R_008F0C_SQ_BUF_RSRC_WORD3                                      0x008F0C
 #define   S_008F0C_DST_SEL_X(x)                                       (((x) & 0x07) << 0)
 #define   G_008F0C_DST_SEL_X(x)                                       (((x) >> 0) & 0x07)
@@ -3862,7 +5137,12 @@
 #define     V_008F0C_SQ_RSRC_BUF_RSVD_1                             0x01
 #define     V_008F0C_SQ_RSRC_BUF_RSVD_2                             0x02
 #define     V_008F0C_SQ_RSRC_BUF_RSVD_3                             0x03
+#define R_030F10_DB_OCCLUSION_COUNT2_LOW                                0x030F10
 #define R_008F10_SQ_IMG_RSRC_WORD0                                      0x008F10
+#define R_030F14_DB_OCCLUSION_COUNT2_HI                                 0x030F14
+#define   S_030F14_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F14_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F14_COUNT_HI                                           0x80000000
 #define R_008F14_SQ_IMG_RSRC_WORD1                                      0x008F14
 #define   S_008F14_BASE_ADDRESS_HI(x)                                 (((x) & 0xFF) << 0)
 #define   G_008F14_BASE_ADDRESS_HI(x)                                 (((x) >> 0) & 0xFF)
@@ -3961,6 +5241,7 @@
 #define   G_008F14_MTYPE(x)                                           (((x) >> 30) & 0x03)
 #define   C_008F14_MTYPE                                              0x3FFFFFFF
 /*     */
+#define R_030F18_DB_OCCLUSION_COUNT3_LOW                                0x030F18
 #define R_008F18_SQ_IMG_RSRC_WORD2                                      0x008F18
 #define   S_008F18_WIDTH(x)                                           (((x) & 0x3FFF) << 0)
 #define   G_008F18_WIDTH(x)                                           (((x) >> 0) & 0x3FFF)
@@ -3974,6 +5255,10 @@
 #define   S_008F18_INTERLACED(x)                                      (((x) & 0x1) << 31)
 #define   G_008F18_INTERLACED(x)                                      (((x) >> 31) & 0x1)
 #define   C_008F18_INTERLACED                                         0x7FFFFFFF
+#define R_030F1C_DB_OCCLUSION_COUNT3_HI                                 0x030F1C
+#define   S_030F1C_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030F1C_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030F1C_COUNT_HI                                           0x80000000
 #define R_008F1C_SQ_IMG_RSRC_WORD3                                      0x008F1C
 #define   S_008F1C_DST_SEL_X(x)                                       (((x) & 0x07) << 0)
 #define   G_008F1C_DST_SEL_X(x)                                       (((x) >> 0) & 0x07)
@@ -4084,6 +5369,23 @@
 #define   G_008F28_LOD_HDW_CNT_EN(x)                                  (((x) >> 20) & 0x1)
 #define   C_008F28_LOD_HDW_CNT_EN                                     0xFFEFFFFF
 /*     */
+/* VI */
+#define   S_008F28_COMPRESSION_EN(x)                                  (((x) & 0x1) << 21)
+#define   G_008F28_COMPRESSION_EN(x)                                  (((x) >> 21) & 0x1)
+#define   C_008F28_COMPRESSION_EN                                     0xFFDFFFFF
+#define   S_008F28_ALPHA_IS_ON_MSB(x)                                 (((x) & 0x1) << 22)
+#define   G_008F28_ALPHA_IS_ON_MSB(x)                                 (((x) >> 22) & 0x1)
+#define   C_008F28_ALPHA_IS_ON_MSB                                    0xFFBFFFFF
+#define   S_008F28_COLOR_TRANSFORM(x)                                 (((x) & 0x1) << 23)
+#define   G_008F28_COLOR_TRANSFORM(x)                                 (((x) >> 23) & 0x1)
+#define   C_008F28_COLOR_TRANSFORM                                    0xFF7FFFFF
+#define   S_008F28_LOST_ALPHA_BITS(x)                                 (((x) & 0x0F) << 24)
+#define   G_008F28_LOST_ALPHA_BITS(x)                                 (((x) >> 24) & 0x0F)
+#define   C_008F28_LOST_ALPHA_BITS                                    0xF0FFFFFF
+#define   S_008F28_LOST_COLOR_BITS(x)                                 (((unsigned)(x) & 0x0F) << 28) /* unsigned: field reaches bit 31 */
+#define   G_008F28_LOST_COLOR_BITS(x)                                 (((x) >> 28) & 0x0F)
+#define   C_008F28_LOST_COLOR_BITS                                    0x0FFFFFFF
+/*    */
 #define R_008F2C_SQ_IMG_RSRC_WORD7                                      0x008F2C
 #define R_008F30_SQ_IMG_SAMP_WORD0                                      0x008F30
 #define   S_008F30_CLAMP_X(x)                                         (((x) & 0x07) << 0)
@@ -4148,6 +5450,11 @@
 #define   S_008F30_FILTER_MODE(x)                                     (((x) & 0x03) << 29)
 #define   G_008F30_FILTER_MODE(x)                                     (((x) >> 29) & 0x03)
 #define   C_008F30_FILTER_MODE                                        0x9FFFFFFF
+/* VI */
+#define   S_008F30_COMPAT_MODE(x)                                     (((unsigned)(x) & 0x1) << 31) /* unsigned: 1 << 31 overflows int */
+#define   G_008F30_COMPAT_MODE(x)                                     (((x) >> 31) & 0x1)
+#define   C_008F30_COMPAT_MODE                                        0x7FFFFFFF
+/*    */
 #define R_008F34_SQ_IMG_SAMP_WORD1                                      0x008F34
 #define   S_008F34_MIN_LOD(x)                                         (((x) & 0xFFF) << 0)
 #define   G_008F34_MIN_LOD(x)                                         (((x) >> 0) & 0xFFF)
@@ -4313,6 +5620,11 @@
 #define   G_008F44_OFFSET(x)                                          (((x) >> 0) & 0xFFFFFF)
 #define   C_008F44_OFFSET                                             0xFF000000
 /*     */
+#define R_030FF8_DB_ZPASS_COUNT_LOW                                     0x030FF8
+#define R_030FFC_DB_ZPASS_COUNT_HI                                      0x030FFC
+#define   S_030FFC_COUNT_HI(x)                                        (((x) & 0x7FFFFFFF) << 0)
+#define   G_030FFC_COUNT_HI(x)                                        (((x) >> 0) & 0x7FFFFFFF)
+#define   C_030FFC_COUNT_HI                                           0x80000000
 #define R_009100_SPI_CONFIG_CNTL                                        0x009100
 #define   S_009100_GPR_WRITE_PRIORITY(x)                              (((x) & 0x1FFFFF) << 0)
 #define   G_009100_GPR_WRITE_PRIORITY(x)                              (((x) >> 0) & 0x1FFFFF)
@@ -4437,6 +5749,34 @@
 #define   S_009858_MSAA16_Y(x)                                        (((x) & 0x03) << 18)
 #define   G_009858_MSAA16_Y(x)                                        (((x) >> 18) & 0x03)
 #define   C_009858_MSAA16_Y                                           0xFFF3FFFF
+#define R_0098F8_GB_ADDR_CONFIG                                         0x0098F8
+#define   S_0098F8_NUM_PIPES(x)                                       (((x) & 0x07) << 0)
+#define   G_0098F8_NUM_PIPES(x)                                       (((x) >> 0) & 0x07)
+#define   C_0098F8_NUM_PIPES                                          0xFFFFFFF8
+#define   S_0098F8_PIPE_INTERLEAVE_SIZE(x)                            (((x) & 0x07) << 4)
+#define   G_0098F8_PIPE_INTERLEAVE_SIZE(x)                            (((x) >> 4) & 0x07)
+#define   C_0098F8_PIPE_INTERLEAVE_SIZE                               0xFFFFFF8F
+#define   S_0098F8_BANK_INTERLEAVE_SIZE(x)                            (((x) & 0x07) << 8)
+#define   G_0098F8_BANK_INTERLEAVE_SIZE(x)                            (((x) >> 8) & 0x07)
+#define   C_0098F8_BANK_INTERLEAVE_SIZE                               0xFFFFF8FF
+#define   S_0098F8_NUM_SHADER_ENGINES(x)                              (((x) & 0x03) << 12)
+#define   G_0098F8_NUM_SHADER_ENGINES(x)                              (((x) >> 12) & 0x03)
+#define   C_0098F8_NUM_SHADER_ENGINES                                 0xFFFFCFFF
+#define   S_0098F8_SHADER_ENGINE_TILE_SIZE(x)                         (((x) & 0x07) << 16)
+#define   G_0098F8_SHADER_ENGINE_TILE_SIZE(x)                         (((x) >> 16) & 0x07)
+#define   C_0098F8_SHADER_ENGINE_TILE_SIZE                            0xFFF8FFFF
+#define   S_0098F8_NUM_GPUS(x)                                        (((x) & 0x07) << 20)
+#define   G_0098F8_NUM_GPUS(x)                                        (((x) >> 20) & 0x07)
+#define   C_0098F8_NUM_GPUS                                           0xFF8FFFFF
+#define   S_0098F8_MULTI_GPU_TILE_SIZE(x)                             (((x) & 0x03) << 24)
+#define   G_0098F8_MULTI_GPU_TILE_SIZE(x)                             (((x) >> 24) & 0x03)
+#define   C_0098F8_MULTI_GPU_TILE_SIZE                                0xFCFFFFFF
+#define   S_0098F8_ROW_SIZE(x)                                        (((x) & 0x03) << 28)
+#define   G_0098F8_ROW_SIZE(x)                                        (((x) >> 28) & 0x03)
+#define   C_0098F8_ROW_SIZE                                           0xCFFFFFFF
+#define   S_0098F8_NUM_LOWER_PIPES(x)                                 (((x) & 0x1) << 30)
+#define   G_0098F8_NUM_LOWER_PIPES(x)                                 (((x) >> 30) & 0x1)
+#define   C_0098F8_NUM_LOWER_PIPES                                    0xBFFFFFFF
 #define R_009910_GB_TILE_MODE0                                          0x009910
 #define   S_009910_MICRO_TILE_MODE(x)                                 (((x) & 0x03) << 0)
 #define   G_009910_MICRO_TILE_MODE(x)                                 (((x) >> 0) & 0x03)
@@ -4515,14 +5855,88 @@
 #define     V_009910_ADDR_SURF_4_BANK                               0x01
 #define     V_009910_ADDR_SURF_8_BANK                               0x02
 #define     V_009910_ADDR_SURF_16_BANK                              0x03
-/* CIK */
 #define   S_009910_MICRO_TILE_MODE_NEW(x)                             (((x) & 0x07) << 22)
 #define   G_009910_MICRO_TILE_MODE_NEW(x)                             (((x) >> 22) & 0x07)
-#define   C_009910_MICRO_TILE_MODE_NEW(x)                             0xFE3FFFFF
+#define   C_009910_MICRO_TILE_MODE_NEW                                0xFE3FFFFF
 #define     V_009910_ADDR_SURF_DISPLAY_MICRO_TILING                 0x00
 #define     V_009910_ADDR_SURF_THIN_MICRO_TILING                    0x01
 #define     V_009910_ADDR_SURF_DEPTH_MICRO_TILING                   0x02
 #define     V_009910_ADDR_SURF_ROTATED_MICRO_TILING                 0x03
+#define   S_009910_SAMPLE_SPLIT(x)                                    (((x) & 0x03) << 25)
+#define   G_009910_SAMPLE_SPLIT(x)                                    (((x) >> 25) & 0x03)
+#define   C_009910_SAMPLE_SPLIT                                       0xF9FFFFFF
+#define R_009914_GB_TILE_MODE1                                          0x009914
+#define R_009918_GB_TILE_MODE2                                          0x009918
+#define R_00991C_GB_TILE_MODE3                                          0x00991C
+#define R_009920_GB_TILE_MODE4                                          0x009920
+#define R_009924_GB_TILE_MODE5                                          0x009924
+#define R_009928_GB_TILE_MODE6                                          0x009928
+#define R_00992C_GB_TILE_MODE7                                          0x00992C
+#define R_009930_GB_TILE_MODE8                                          0x009930
+#define R_009934_GB_TILE_MODE9                                          0x009934
+#define R_009938_GB_TILE_MODE10                                         0x009938
+#define R_00993C_GB_TILE_MODE11                                         0x00993C
+#define R_009940_GB_TILE_MODE12                                         0x009940
+#define R_009944_GB_TILE_MODE13                                         0x009944
+#define R_009948_GB_TILE_MODE14                                         0x009948
+#define R_00994C_GB_TILE_MODE15                                         0x00994C
+#define R_009950_GB_TILE_MODE16                                         0x009950
+#define R_009954_GB_TILE_MODE17                                         0x009954
+#define R_009958_GB_TILE_MODE18                                         0x009958
+#define R_00995C_GB_TILE_MODE19                                         0x00995C
+#define R_009960_GB_TILE_MODE20                                         0x009960
+#define R_009964_GB_TILE_MODE21                                         0x009964
+#define R_009968_GB_TILE_MODE22                                         0x009968
+#define R_00996C_GB_TILE_MODE23                                         0x00996C
+#define R_009970_GB_TILE_MODE24                                         0x009970
+#define R_009974_GB_TILE_MODE25                                         0x009974
+#define R_009978_GB_TILE_MODE26                                         0x009978
+#define R_00997C_GB_TILE_MODE27                                         0x00997C
+#define R_009980_GB_TILE_MODE28                                         0x009980
+#define R_009984_GB_TILE_MODE29                                         0x009984
+#define R_009988_GB_TILE_MODE30                                         0x009988
+#define R_00998C_GB_TILE_MODE31                                         0x00998C
+/* CIK */
+#define R_009990_GB_MACROTILE_MODE0                                     0x009990
+#define   S_009990_BANK_WIDTH(x)                                      (((x) & 0x03) << 0)
+#define   G_009990_BANK_WIDTH(x)                                      (((x) >> 0) & 0x03)
+#define   C_009990_BANK_WIDTH                                         0xFFFFFFFC
+#define   S_009990_BANK_HEIGHT(x)                                     (((x) & 0x03) << 2)
+#define   G_009990_BANK_HEIGHT(x)                                     (((x) >> 2) & 0x03)
+#define   C_009990_BANK_HEIGHT                                        0xFFFFFFF3
+#define   S_009990_MACRO_TILE_ASPECT(x)                               (((x) & 0x03) << 4)
+#define   G_009990_MACRO_TILE_ASPECT(x)                               (((x) >> 4) & 0x03)
+#define   C_009990_MACRO_TILE_ASPECT                                  0xFFFFFFCF
+#define   S_009990_NUM_BANKS(x)                                       (((x) & 0x03) << 6)
+#define   G_009990_NUM_BANKS(x)                                       (((x) >> 6) & 0x03)
+#define   C_009990_NUM_BANKS                                          0xFFFFFF3F
+#define R_009994_GB_MACROTILE_MODE1                                     0x009994
+#define R_009998_GB_MACROTILE_MODE2                                     0x009998
+#define R_00999C_GB_MACROTILE_MODE3                                     0x00999C
+#define R_0099A0_GB_MACROTILE_MODE4                                     0x0099A0
+#define R_0099A4_GB_MACROTILE_MODE5                                     0x0099A4
+#define R_0099A8_GB_MACROTILE_MODE6                                     0x0099A8
+#define R_0099AC_GB_MACROTILE_MODE7                                     0x0099AC
+#define R_0099B0_GB_MACROTILE_MODE8                                     0x0099B0
+#define R_0099B4_GB_MACROTILE_MODE9                                     0x0099B4
+#define R_0099B8_GB_MACROTILE_MODE10                                    0x0099B8
+#define R_0099BC_GB_MACROTILE_MODE11                                    0x0099BC
+#define R_0099C0_GB_MACROTILE_MODE12                                    0x0099C0
+#define R_0099C4_GB_MACROTILE_MODE13                                    0x0099C4
+#define R_0099C8_GB_MACROTILE_MODE14                                    0x0099C8
+#define R_0099CC_GB_MACROTILE_MODE15                                    0x0099CC
+/*     */
+#define R_00B000_SPI_SHADER_TBA_LO_PS                                   0x00B000
+#define R_00B004_SPI_SHADER_TBA_HI_PS                                   0x00B004
+#define   S_00B004_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B004_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B004_MEM_BASE                                           0xFFFFFF00
+#define R_00B008_SPI_SHADER_TMA_LO_PS                                   0x00B008
+#define R_00B00C_SPI_SHADER_TMA_HI_PS                                   0x00B00C
+#define   S_00B00C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B00C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B00C_MEM_BASE                                           0xFFFFFF00
+/* CIK */
 #define R_00B01C_SPI_SHADER_PGM_RSRC3_PS                                0x00B01C
 #define   S_00B01C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
 #define   G_00B01C_CU_EN(x)                                           (((x) >> 0) & 0xFFFF)
@@ -4582,6 +5996,9 @@
 #define   S_00B02C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B02C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B02C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B02C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B02C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B02C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B02C_WAVE_CNT_EN(x)                                     (((x) & 0x1) << 7)
 #define   G_00B02C_WAVE_CNT_EN(x)                                     (((x) >> 7) & 0x1)
 #define   C_00B02C_WAVE_CNT_EN                                        0xFFFFFF7F
@@ -4591,6 +6008,9 @@
 #define   S_00B02C_EXCP_EN(x)                                         (((x) & 0x7F) << 16) /* mask is 0x1FF on CIK */
 #define   G_00B02C_EXCP_EN(x)                                         (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B02C_EXCP_EN                                            0xFF80FFFF /* mask is 0x1FF on CIK */
+#define   S_00B02C_EXCP_EN_CIK(x)                                     (((x) & 0x1FF) << 16)
+#define   G_00B02C_EXCP_EN_CIK(x)                                     (((x) >> 16) & 0x1FF)
+#define   C_00B02C_EXCP_EN_CIK                                        0xFE00FFFF
 #define R_00B030_SPI_SHADER_USER_DATA_PS_0                              0x00B030
 #define R_00B034_SPI_SHADER_USER_DATA_PS_1                              0x00B034
 #define R_00B038_SPI_SHADER_USER_DATA_PS_2                              0x00B038
@@ -4607,6 +6027,16 @@
 #define R_00B064_SPI_SHADER_USER_DATA_PS_13                             0x00B064
 #define R_00B068_SPI_SHADER_USER_DATA_PS_14                             0x00B068
 #define R_00B06C_SPI_SHADER_USER_DATA_PS_15                             0x00B06C
+#define R_00B100_SPI_SHADER_TBA_LO_VS                                   0x00B100
+#define R_00B104_SPI_SHADER_TBA_HI_VS                                   0x00B104
+#define   S_00B104_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B104_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B104_MEM_BASE                                           0xFFFFFF00
+#define R_00B108_SPI_SHADER_TMA_LO_VS                                   0x00B108
+#define R_00B10C_SPI_SHADER_TMA_HI_VS                                   0x00B10C
+#define   S_00B10C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B10C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B10C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B118_SPI_SHADER_PGM_RSRC3_VS                                0x00B118
 #define   S_00B118_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4674,6 +6104,9 @@
 #define   S_00B12C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B12C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B12C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B12C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B12C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B12C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B12C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B12C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B12C_OC_LDS_EN                                          0xFFFFFF7F
@@ -4695,6 +6128,14 @@
 #define   S_00B12C_EXCP_EN(x)                                         (((x) & 0x7F) << 13) /* mask is 0x1FF on CIK */
 #define   G_00B12C_EXCP_EN(x)                                         (((x) >> 13) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B12C_EXCP_EN                                            0xFFF01FFF /* mask is 0x1FF on CIK */
+#define   S_00B12C_EXCP_EN_CIK(x)                                     (((x) & 0x1FF) << 13)
+#define   G_00B12C_EXCP_EN_CIK(x)                                     (((x) >> 13) & 0x1FF)
+#define   C_00B12C_EXCP_EN_CIK                                        0xFFC01FFF
+/* VI */
+#define   S_00B12C_DISPATCH_DRAW_EN(x)                                (((x) & 0x1) << 24)
+#define   G_00B12C_DISPATCH_DRAW_EN(x)                                (((x) >> 24) & 0x1)
+#define   C_00B12C_DISPATCH_DRAW_EN                                   0xFEFFFFFF
+/*    */
 #define R_00B130_SPI_SHADER_USER_DATA_VS_0                              0x00B130
 #define R_00B134_SPI_SHADER_USER_DATA_VS_1                              0x00B134
 #define R_00B138_SPI_SHADER_USER_DATA_VS_2                              0x00B138
@@ -4711,6 +6152,16 @@
 #define R_00B164_SPI_SHADER_USER_DATA_VS_13                             0x00B164
 #define R_00B168_SPI_SHADER_USER_DATA_VS_14                             0x00B168
 #define R_00B16C_SPI_SHADER_USER_DATA_VS_15                             0x00B16C
+#define R_00B200_SPI_SHADER_TBA_LO_GS                                   0x00B200
+#define R_00B204_SPI_SHADER_TBA_HI_GS                                   0x00B204
+#define   S_00B204_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B204_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B204_MEM_BASE                                           0xFFFFFF00
+#define R_00B208_SPI_SHADER_TMA_LO_GS                                   0x00B208
+#define R_00B20C_SPI_SHADER_TMA_HI_GS                                   0x00B20C
+#define   S_00B20C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B20C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B20C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B21C_SPI_SHADER_PGM_RSRC3_GS                                0x00B21C
 #define   S_00B21C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4723,6 +6174,11 @@
 #define   G_00B21C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 22) & 0x0F)
 #define   C_00B21C_LOCK_LOW_THRESHOLD                                 0xFC3FFFFF
 /*     */
+/* VI */
+#define   S_00B21C_GROUP_FIFO_DEPTH(x)                                (((unsigned)(x) & 0x3F) << 26) /* unsigned: field reaches bit 31 */
+#define   G_00B21C_GROUP_FIFO_DEPTH(x)                                (((x) >> 26) & 0x3F)
+#define   C_00B21C_GROUP_FIFO_DEPTH                                   0x03FFFFFF
+/*    */
 #define R_00B220_SPI_SHADER_PGM_LO_GS                                   0x00B220
 #define R_00B224_SPI_SHADER_PGM_HI_GS                                   0x00B224
 #define   S_00B224_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4771,10 +6227,41 @@
 #define   S_00B22C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B22C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B22C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B22C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B22C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B22C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B22C_EXCP_EN(x)                                         (((x) & 0x7F) << 7) /* mask is 0x1FF on CIK */
 #define   G_00B22C_EXCP_EN(x)                                         (((x) >> 7) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B22C_EXCP_EN                                            0xFFFFC07F /* mask is 0x1FF on CIK */
+#define   S_00B22C_EXCP_EN_CIK(x)                                     (((x) & 0x1FF) << 7)
+#define   G_00B22C_EXCP_EN_CIK(x)                                     (((x) >> 7) & 0x1FF)
+#define   C_00B22C_EXCP_EN_CIK                                        0xFFFF007F
 #define R_00B230_SPI_SHADER_USER_DATA_GS_0                              0x00B230
+#define R_00B234_SPI_SHADER_USER_DATA_GS_1                              0x00B234
+#define R_00B238_SPI_SHADER_USER_DATA_GS_2                              0x00B238
+#define R_00B23C_SPI_SHADER_USER_DATA_GS_3                              0x00B23C
+#define R_00B240_SPI_SHADER_USER_DATA_GS_4                              0x00B240
+#define R_00B244_SPI_SHADER_USER_DATA_GS_5                              0x00B244
+#define R_00B248_SPI_SHADER_USER_DATA_GS_6                              0x00B248
+#define R_00B24C_SPI_SHADER_USER_DATA_GS_7                              0x00B24C
+#define R_00B250_SPI_SHADER_USER_DATA_GS_8                              0x00B250
+#define R_00B254_SPI_SHADER_USER_DATA_GS_9                              0x00B254
+#define R_00B258_SPI_SHADER_USER_DATA_GS_10                             0x00B258
+#define R_00B25C_SPI_SHADER_USER_DATA_GS_11                             0x00B25C
+#define R_00B260_SPI_SHADER_USER_DATA_GS_12                             0x00B260
+#define R_00B264_SPI_SHADER_USER_DATA_GS_13                             0x00B264
+#define R_00B268_SPI_SHADER_USER_DATA_GS_14                             0x00B268
+#define R_00B26C_SPI_SHADER_USER_DATA_GS_15                             0x00B26C
+#define R_00B300_SPI_SHADER_TBA_LO_ES                                   0x00B300
+#define R_00B304_SPI_SHADER_TBA_HI_ES                                   0x00B304
+#define   S_00B304_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B304_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B304_MEM_BASE                                           0xFFFFFF00
+#define R_00B308_SPI_SHADER_TMA_LO_ES                                   0x00B308
+#define R_00B30C_SPI_SHADER_TMA_HI_ES                                   0x00B30C
+#define   S_00B30C_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B30C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B30C_MEM_BASE                                           0xFFFFFF00
 /* CIK */
 #define R_00B31C_SPI_SHADER_PGM_RSRC3_ES                                0x00B31C
 #define   S_00B31C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4787,6 +6274,11 @@
 #define   G_00B31C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 22) & 0x0F)
 #define   C_00B31C_LOCK_LOW_THRESHOLD                                 0xFC3FFFFF
 /*     */
+/* VI */
+#define   S_00B31C_GROUP_FIFO_DEPTH(x)                                (((unsigned)(x) & 0x3F) << 26) /* unsigned: field reaches bit 31 */
+#define   G_00B31C_GROUP_FIFO_DEPTH(x)                                (((x) >> 26) & 0x3F)
+#define   C_00B31C_GROUP_FIFO_DEPTH                                   0x03FFFFFF
+/*    */
 #define R_00B320_SPI_SHADER_PGM_LO_ES                                   0x00B320
 #define R_00B324_SPI_SHADER_PGM_HI_ES                                   0x00B324
 #define   S_00B324_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4838,6 +6330,9 @@
 #define   S_00B32C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B32C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B32C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B32C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6)
+#define   G_00B32C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B32C_TRAP_PRESENT                                       0xFFFFFFBF
 #define   S_00B32C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B32C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B32C_OC_LDS_EN                                          0xFFFFFF7F
@@ -4848,6 +6343,31 @@
 #define   G_00B32C_LDS_SIZE(x)                                        (((x) >> 20) & 0x1FF) /* CIK, for on-chip GS */
 #define   C_00B32C_LDS_SIZE                                           0xE00FFFFF /* CIK, for on-chip GS */
 #define R_00B330_SPI_SHADER_USER_DATA_ES_0                              0x00B330
+#define R_00B334_SPI_SHADER_USER_DATA_ES_1                              0x00B334 /* USER_DATA_ES_1..15: one register every 4 bytes, 0x00B334..0x00B36C */
+#define R_00B338_SPI_SHADER_USER_DATA_ES_2                              0x00B338
+#define R_00B33C_SPI_SHADER_USER_DATA_ES_3                              0x00B33C
+#define R_00B340_SPI_SHADER_USER_DATA_ES_4                              0x00B340
+#define R_00B344_SPI_SHADER_USER_DATA_ES_5                              0x00B344
+#define R_00B348_SPI_SHADER_USER_DATA_ES_6                              0x00B348
+#define R_00B34C_SPI_SHADER_USER_DATA_ES_7                              0x00B34C
+#define R_00B350_SPI_SHADER_USER_DATA_ES_8                              0x00B350
+#define R_00B354_SPI_SHADER_USER_DATA_ES_9                              0x00B354
+#define R_00B358_SPI_SHADER_USER_DATA_ES_10                             0x00B358
+#define R_00B35C_SPI_SHADER_USER_DATA_ES_11                             0x00B35C
+#define R_00B360_SPI_SHADER_USER_DATA_ES_12                             0x00B360
+#define R_00B364_SPI_SHADER_USER_DATA_ES_13                             0x00B364
+#define R_00B368_SPI_SHADER_USER_DATA_ES_14                             0x00B368
+#define R_00B36C_SPI_SHADER_USER_DATA_ES_15                             0x00B36C
+#define R_00B400_SPI_SHADER_TBA_LO_HS                                   0x00B400 /* TBA and TMA each get a LO/HI register pair */
+#define R_00B404_SPI_SHADER_TBA_HI_HS                                   0x00B404
+#define   S_00B404_MEM_BASE(x)                                        (((x) & 0xFF) << 0) /* set: 8-bit field, bits [7:0] of the HI register */
+#define   G_00B404_MEM_BASE(x)                                        (((x) >> 0) & 0xFF) /* get: extract bits [7:0] */
+#define   C_00B404_MEM_BASE                                           0xFFFFFF00 /* clear mask for bits [7:0] */
+#define R_00B408_SPI_SHADER_TMA_LO_HS                                   0x00B408
+#define R_00B40C_SPI_SHADER_TMA_HI_HS                                   0x00B40C
+#define   S_00B40C_MEM_BASE(x)                                        (((x) & 0xFF) << 0) /* set: 8-bit field, bits [7:0] of the HI register */
+#define   G_00B40C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF) /* get: extract bits [7:0] */
+#define   C_00B40C_MEM_BASE                                           0xFFFFFF00 /* clear mask for bits [7:0] */
 /* CIK */
 #define R_00B41C_SPI_SHADER_PGM_RSRC3_HS                                0x00B41C
 #define   S_00B41C_WAVE_LIMIT(x)                                      (((x) & 0x3F) << 0)
@@ -4857,6 +6377,11 @@
 #define   G_00B41C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 6) & 0x0F)
 #define   C_00B41C_LOCK_LOW_THRESHOLD                                 0xFFFFFC3F
 /*     */
+/* VI */
+#define   S_00B41C_GROUP_FIFO_DEPTH(x)                                (((x) & 0x3F) << 10) /* set: 6-bit field, bits [15:10] */
+#define   G_00B41C_GROUP_FIFO_DEPTH(x)                                (((x) >> 10) & 0x3F) /* get: extract bits [15:10] */
+#define   C_00B41C_GROUP_FIFO_DEPTH                                   0xFFFF03FF /* clear mask for bits [15:10] */
+/*    */
 #define R_00B420_SPI_SHADER_PGM_LO_HS                                   0x00B420
 #define R_00B424_SPI_SHADER_PGM_HI_HS                                   0x00B424
 #define   S_00B424_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4902,6 +6427,9 @@
 #define   S_00B42C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B42C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B42C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B42C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6) /* set: single-bit field at bit 6 */
+#define   G_00B42C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1) /* get: extract bit 6 */
+#define   C_00B42C_TRAP_PRESENT                                       0xFFFFFFBF /* clear mask: every bit except bit 6 */
 #define   S_00B42C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B42C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B42C_OC_LDS_EN                                          0xFFFFFF7F
@@ -4912,6 +6440,31 @@
 #define   G_00B42C_EXCP_EN(x)                                         (((x) >> 9) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B42C_EXCP_EN                                            0xFFFF01FF /* mask is 0x1FF on CIK */
 #define R_00B430_SPI_SHADER_USER_DATA_HS_0                              0x00B430
+#define R_00B434_SPI_SHADER_USER_DATA_HS_1                              0x00B434 /* USER_DATA_HS_1..15: one register every 4 bytes, 0x00B434..0x00B46C */
+#define R_00B438_SPI_SHADER_USER_DATA_HS_2                              0x00B438
+#define R_00B43C_SPI_SHADER_USER_DATA_HS_3                              0x00B43C
+#define R_00B440_SPI_SHADER_USER_DATA_HS_4                              0x00B440
+#define R_00B444_SPI_SHADER_USER_DATA_HS_5                              0x00B444
+#define R_00B448_SPI_SHADER_USER_DATA_HS_6                              0x00B448
+#define R_00B44C_SPI_SHADER_USER_DATA_HS_7                              0x00B44C
+#define R_00B450_SPI_SHADER_USER_DATA_HS_8                              0x00B450
+#define R_00B454_SPI_SHADER_USER_DATA_HS_9                              0x00B454
+#define R_00B458_SPI_SHADER_USER_DATA_HS_10                             0x00B458
+#define R_00B45C_SPI_SHADER_USER_DATA_HS_11                             0x00B45C
+#define R_00B460_SPI_SHADER_USER_DATA_HS_12                             0x00B460
+#define R_00B464_SPI_SHADER_USER_DATA_HS_13                             0x00B464
+#define R_00B468_SPI_SHADER_USER_DATA_HS_14                             0x00B468
+#define R_00B46C_SPI_SHADER_USER_DATA_HS_15                             0x00B46C
+#define R_00B500_SPI_SHADER_TBA_LO_LS                                   0x00B500 /* TBA and TMA each get a LO/HI register pair */
+#define R_00B504_SPI_SHADER_TBA_HI_LS                                   0x00B504
+#define   S_00B504_MEM_BASE(x)                                        (((x) & 0xFF) << 0) /* set: 8-bit field, bits [7:0] of the HI register */
+#define   G_00B504_MEM_BASE(x)                                        (((x) >> 0) & 0xFF) /* get: extract bits [7:0] */
+#define   C_00B504_MEM_BASE                                           0xFFFFFF00 /* clear mask for bits [7:0] */
+#define R_00B508_SPI_SHADER_TMA_LO_LS                                   0x00B508
+#define R_00B50C_SPI_SHADER_TMA_HI_LS                                   0x00B50C
+#define   S_00B50C_MEM_BASE(x)                                        (((x) & 0xFF) << 0) /* set: 8-bit field, bits [7:0] of the HI register */
+#define   G_00B50C_MEM_BASE(x)                                        (((x) >> 0) & 0xFF) /* get: extract bits [7:0] */
+#define   C_00B50C_MEM_BASE                                           0xFFFFFF00 /* clear mask for bits [7:0] */
 /* CIK */
 #define R_00B51C_SPI_SHADER_PGM_RSRC3_LS                                0x00B51C
 #define   S_00B51C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
@@ -4924,6 +6477,11 @@
 #define   G_00B51C_LOCK_LOW_THRESHOLD(x)                              (((x) >> 22) & 0x0F)
 #define   C_00B51C_LOCK_LOW_THRESHOLD                                 0xFC3FFFFF
 /*     */
+/* VI */
+#define   S_00B51C_GROUP_FIFO_DEPTH(x)                                (((unsigned)(x) & 0x3F) << 26) /* set: 6-bit field, bits [31:26]; unsigned so the shift cannot overflow a signed int */
+#define   G_00B51C_GROUP_FIFO_DEPTH(x)                                (((x) >> 26) & 0x3F) /* get: extract bits [31:26] */
+#define   C_00B51C_GROUP_FIFO_DEPTH                                   0x03FFFFFF /* clear mask: AND with a register value to zero the field */
+/*    */
 #define R_00B520_SPI_SHADER_PGM_LO_LS                                   0x00B520
 #define R_00B524_SPI_SHADER_PGM_HI_LS                                   0x00B524
 #define   S_00B524_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
@@ -4972,6 +6530,9 @@
 #define   S_00B52C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B52C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B52C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B52C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6) /* set: single-bit field at bit 6 */
+#define   G_00B52C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1) /* get: extract bit 6 */
+#define   C_00B52C_TRAP_PRESENT                                       0xFFFFFFBF /* clear mask: every bit except bit 6 */
 #define   S_00B52C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 7)
 #define   G_00B52C_LDS_SIZE(x)                                        (((x) >> 7) & 0x1FF)
 #define   C_00B52C_LDS_SIZE                                           0xFFFF007F
@@ -4979,6 +6540,21 @@
 #define   G_00B52C_EXCP_EN(x)                                         (((x) >> 16) & 0x7F) /* mask is 0x1FF on CIK */
 #define   C_00B52C_EXCP_EN                                            0xFF80FFFF /* mask is 0x1FF on CIK */
 #define R_00B530_SPI_SHADER_USER_DATA_LS_0                              0x00B530
+#define R_00B534_SPI_SHADER_USER_DATA_LS_1                              0x00B534 /* USER_DATA_LS_1..15: one register every 4 bytes, 0x00B534..0x00B56C */
+#define R_00B538_SPI_SHADER_USER_DATA_LS_2                              0x00B538
+#define R_00B53C_SPI_SHADER_USER_DATA_LS_3                              0x00B53C
+#define R_00B540_SPI_SHADER_USER_DATA_LS_4                              0x00B540
+#define R_00B544_SPI_SHADER_USER_DATA_LS_5                              0x00B544
+#define R_00B548_SPI_SHADER_USER_DATA_LS_6                              0x00B548
+#define R_00B54C_SPI_SHADER_USER_DATA_LS_7                              0x00B54C
+#define R_00B550_SPI_SHADER_USER_DATA_LS_8                              0x00B550
+#define R_00B554_SPI_SHADER_USER_DATA_LS_9                              0x00B554
+#define R_00B558_SPI_SHADER_USER_DATA_LS_10                             0x00B558
+#define R_00B55C_SPI_SHADER_USER_DATA_LS_11                             0x00B55C
+#define R_00B560_SPI_SHADER_USER_DATA_LS_12                             0x00B560
+#define R_00B564_SPI_SHADER_USER_DATA_LS_13                             0x00B564
+#define R_00B568_SPI_SHADER_USER_DATA_LS_14                             0x00B568
+#define R_00B56C_SPI_SHADER_USER_DATA_LS_15                             0x00B56C
 #define R_00B800_COMPUTE_DISPATCH_INITIATOR                             0x00B800
 #define   S_00B800_COMPUTE_SHADER_EN(x)                               (((x) & 0x1) << 0)
 #define   G_00B800_COMPUTE_SHADER_EN(x)                               (((x) >> 0) & 0x1)
@@ -5049,6 +6625,16 @@
 #define   S_00B82C_MAX_WAVE_ID(x)                                     (((x) & 0xFFF) << 0)
 #define   G_00B82C_MAX_WAVE_ID(x)                                     (((x) >> 0) & 0xFFF)
 #define   C_00B82C_MAX_WAVE_ID                                        0xFFFFF000
+/* CIK */
+#define R_00B828_COMPUTE_PIPELINESTAT_ENABLE                            0x00B828
+#define   S_00B828_PIPELINESTAT_ENABLE(x)                             (((x) & 0x1) << 0) /* set: single-bit field at bit 0 */
+#define   G_00B828_PIPELINESTAT_ENABLE(x)                             (((x) >> 0) & 0x1) /* get: extract bit 0 */
+#define   C_00B828_PIPELINESTAT_ENABLE                                0xFFFFFFFE /* clear mask: every bit except bit 0 */
+#define R_00B82C_COMPUTE_PERFCOUNT_ENABLE                               0x00B82C /* CIK-section name for offset 0x00B82C, which also carries the S_/G_/C_00B82C_MAX_WAVE_ID field macros */
+#define   S_00B82C_PERFCOUNT_ENABLE(x)                                (((x) & 0x1) << 0) /* set: single-bit field at bit 0 */
+#define   G_00B82C_PERFCOUNT_ENABLE(x)                                (((x) >> 0) & 0x1) /* get: extract bit 0 */
+#define   C_00B82C_PERFCOUNT_ENABLE                                   0xFFFFFFFE /* clear mask: every bit except bit 0 */
+/*     */
 #define R_00B830_COMPUTE_PGM_LO                                         0x00B830
 #define R_00B834_COMPUTE_PGM_HI                                         0x00B834
 #define   S_00B834_DATA(x)                                            (((x) & 0xFF) << 0)
@@ -5059,6 +6645,16 @@
 #define   G_00B834_INST_ATC(x)                                        (((x) >> 8) & 0x1)
 #define   C_00B834_INST_ATC                                           0xFFFFFEFF
 /*     */
+#define R_00B838_COMPUTE_TBA_LO                                         0x00B838 /* TBA and TMA each get a LO/HI register pair */
+#define R_00B83C_COMPUTE_TBA_HI                                         0x00B83C
+#define   S_00B83C_DATA(x)                                            (((x) & 0xFF) << 0) /* set: 8-bit field, bits [7:0] of the HI register */
+#define   G_00B83C_DATA(x)                                            (((x) >> 0) & 0xFF) /* get: extract bits [7:0] */
+#define   C_00B83C_DATA                                               0xFFFFFF00 /* clear mask for bits [7:0] */
+#define R_00B840_COMPUTE_TMA_LO                                         0x00B840
+#define R_00B844_COMPUTE_TMA_HI                                         0x00B844
+#define   S_00B844_DATA(x)                                            (((x) & 0xFF) << 0) /* set: 8-bit field, bits [7:0] of the HI register */
+#define   G_00B844_DATA(x)                                            (((x) >> 0) & 0xFF) /* get: extract bits [7:0] */
+#define   C_00B844_DATA                                               0xFFFFFF00 /* clear mask for bits [7:0] */
 #define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
 #define   S_00B848_VGPRS(x)                                           (((x) & 0x3F) << 0)
 #define   G_00B848_VGPRS(x)                                           (((x) >> 0) & 0x3F)
@@ -5099,6 +6695,9 @@
 #define   S_00B84C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B84C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B84C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B84C_TRAP_PRESENT(x)                                    (((x) & 0x1) << 6) /* set: single-bit field at bit 6 */
+#define   G_00B84C_TRAP_PRESENT(x)                                    (((x) >> 6) & 0x1) /* get: extract bit 6 */
+#define   C_00B84C_TRAP_PRESENT                                       0xFFFFFFBF /* clear mask: every bit except bit 6 */
 #define   S_00B84C_TGID_X_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B84C_TGID_X_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B84C_TGID_X_EN                                          0xFFFFFF7F
@@ -5125,6 +6724,10 @@
 #define   S_00B84C_EXCP_EN(x)                                         (((x) & 0x7F) << 24)
 #define   G_00B84C_EXCP_EN(x)                                         (((x) >> 24) & 0x7F)
 #define   C_00B84C_EXCP_EN                                            0x80FFFFFF
+#define R_00B850_COMPUTE_VMID                                           0x00B850
+#define   S_00B850_DATA(x)                                            (((x) & 0x0F) << 0) /* set: 4-bit field, bits [3:0] */
+#define   G_00B850_DATA(x)                                            (((x) >> 0) & 0x0F) /* get: extract bits [3:0] */
+#define   C_00B850_DATA                                               0xFFFFFFF0 /* clear mask for bits [3:0] */
 #define R_00B854_COMPUTE_RESOURCE_LIMITS                                0x00B854
 #define   S_00B854_WAVES_PER_SH(x)                                    (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
 #define   G_00B854_WAVES_PER_SH(x)                                    (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
@@ -5167,7 +6770,84 @@
 #define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
 #define   G_00B860_WAVESIZE(x)                                        (((x) >> 12) & 0x1FFF)
 #define   C_00B860_WAVESIZE                                           0xFE000FFF
+/* CIK */
+#define R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2                         0x00B864
+#define   S_00B864_SH0_CU_EN(x)                                       (((x) & 0xFFFF) << 0) /* set: 16-bit field, bits [15:0] */
+#define   G_00B864_SH0_CU_EN(x)                                       (((x) >> 0) & 0xFFFF) /* get: extract bits [15:0] */
+#define   C_00B864_SH0_CU_EN                                          0xFFFF0000 /* clear mask for bits [15:0] */
+#define   S_00B864_SH1_CU_EN(x)                                       (((unsigned)(x) & 0xFFFF) << 16) /* set: bits [31:16]; unsigned so the shift cannot overflow a signed int */
+#define   G_00B864_SH1_CU_EN(x)                                       (((x) >> 16) & 0xFFFF) /* get: extract bits [31:16] */
+#define   C_00B864_SH1_CU_EN                                          0x0000FFFF /* clear mask for bits [31:16] */
+#define R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3                         0x00B868
+#define   S_00B868_SH0_CU_EN(x)                                       (((x) & 0xFFFF) << 0) /* set: 16-bit field, bits [15:0] */
+#define   G_00B868_SH0_CU_EN(x)                                       (((x) >> 0) & 0xFFFF) /* get: extract bits [15:0] */
+#define   C_00B868_SH0_CU_EN                                          0xFFFF0000 /* clear mask for bits [15:0] */
+#define   S_00B868_SH1_CU_EN(x)                                       (((unsigned)(x) & 0xFFFF) << 16) /* set: bits [31:16]; unsigned so the shift cannot overflow a signed int */
+#define   G_00B868_SH1_CU_EN(x)                                       (((x) >> 16) & 0xFFFF) /* get: extract bits [31:16] */
+#define   C_00B868_SH1_CU_EN                                          0x0000FFFF /* clear mask for bits [31:16] */
+#define R_00B86C_COMPUTE_RESTART_X                                      0x00B86C /* RESTART_X/Y/Z: three consecutive registers, no field macros defined */
+#define R_00B870_COMPUTE_RESTART_Y                                      0x00B870
+#define R_00B874_COMPUTE_RESTART_Z                                      0x00B874
+#define R_00B87C_COMPUTE_MISC_RESERVED                                  0x00B87C
+#define   S_00B87C_SEND_SEID(x)                                       (((x) & 0x03) << 0) /* set: 2-bit field, bits [1:0] */
+#define   G_00B87C_SEND_SEID(x)                                       (((x) >> 0) & 0x03) /* get: extract bits [1:0] */
+#define   C_00B87C_SEND_SEID                                          0xFFFFFFFC /* clear mask for bits [1:0] */
+#define   S_00B87C_RESERVED2(x)                                       (((x) & 0x1) << 2) /* RESERVEDn is the single bit at position n */
+#define   G_00B87C_RESERVED2(x)                                       (((x) >> 2) & 0x1)
+#define   C_00B87C_RESERVED2                                          0xFFFFFFFB
+#define   S_00B87C_RESERVED3(x)                                       (((x) & 0x1) << 3)
+#define   G_00B87C_RESERVED3(x)                                       (((x) >> 3) & 0x1)
+#define   C_00B87C_RESERVED3                                          0xFFFFFFF7
+#define   S_00B87C_RESERVED4(x)                                       (((x) & 0x1) << 4)
+#define   G_00B87C_RESERVED4(x)                                       (((x) >> 4) & 0x1)
+#define   C_00B87C_RESERVED4                                          0xFFFFFFEF
+/* VI */
+#define   S_00B87C_WAVE_ID_BASE(x)                                    (((x) & 0xFFF) << 5) /* set: 12-bit field, bits [16:5] */
+#define   G_00B87C_WAVE_ID_BASE(x)                                    (((x) >> 5) & 0xFFF) /* get: extract bits [16:5] */
+#define   C_00B87C_WAVE_ID_BASE                                       0xFFFE001F /* clear mask for bits [16:5] */
+#define R_00B880_COMPUTE_DISPATCH_ID                                    0x00B880 /* no field macros defined for this register */
+#define R_00B884_COMPUTE_THREADGROUP_ID                                 0x00B884 /* no field macros defined for this register */
+#define R_00B888_COMPUTE_RELAUNCH                                       0x00B888
+#define   S_00B888_PAYLOAD(x)                                         (((x) & 0x3FFFFFFF) << 0) /* set: 30-bit field, bits [29:0] */
+#define   G_00B888_PAYLOAD(x)                                         (((x) >> 0) & 0x3FFFFFFF) /* get: extract bits [29:0] */
+#define   C_00B888_PAYLOAD                                            0xC0000000 /* clear mask for bits [29:0] */
+#define   S_00B888_IS_EVENT(x)                                        (((x) & 0x1) << 30) /* set: single-bit field at bit 30 */
+#define   G_00B888_IS_EVENT(x)                                        (((x) >> 30) & 0x1) /* get: extract bit 30 */
+#define   C_00B888_IS_EVENT                                           0xBFFFFFFF /* clear mask: every bit except bit 30 */
+#define   S_00B888_IS_STATE(x)                                        (((unsigned)(x) & 0x1) << 31) /* set: bit 31; unsigned because 1 << 31 overflows a signed 32-bit int */
+#define   G_00B888_IS_STATE(x)                                        (((x) >> 31) & 0x1) /* get: extract bit 31 */
+#define   C_00B888_IS_STATE                                           0x7FFFFFFF /* clear mask: every bit except bit 31 */
+#define R_00B88C_COMPUTE_WAVE_RESTORE_ADDR_LO                           0x00B88C
+#define R_00B890_COMPUTE_WAVE_RESTORE_ADDR_HI                           0x00B890
+#define   S_00B890_ADDR(x)                                            (((x) & 0xFFFF) << 0) /* set: 16-bit field, bits [15:0] of the HI register */
+#define   G_00B890_ADDR(x)                                            (((x) >> 0) & 0xFFFF) /* get: extract bits [15:0] */
+#define   C_00B890_ADDR                                               0xFFFF0000 /* clear mask for bits [15:0] */
+#define R_00B894_COMPUTE_WAVE_RESTORE_CONTROL                           0x00B894
+#define   S_00B894_ATC(x)                                             (((x) & 0x1) << 0) /* set: single-bit field at bit 0 */
+#define   G_00B894_ATC(x)                                             (((x) >> 0) & 0x1) /* get: extract bit 0 */
+#define   C_00B894_ATC                                                0xFFFFFFFE /* clear mask: every bit except bit 0 */
+#define   S_00B894_MTYPE(x)                                           (((x) & 0x03) << 1) /* set: 2-bit field, bits [2:1] */
+#define   G_00B894_MTYPE(x)                                           (((x) >> 1) & 0x03) /* get: extract bits [2:1] */
+#define   C_00B894_MTYPE                                              0xFFFFFFF9 /* clear mask for bits [2:1] */
+/*    */
+/*     */
 #define R_00B900_COMPUTE_USER_DATA_0                                    0x00B900
+#define R_00B904_COMPUTE_USER_DATA_1                                    0x00B904 /* COMPUTE_USER_DATA_1..15: one register every 4 bytes, 0x00B904..0x00B93C */
+#define R_00B908_COMPUTE_USER_DATA_2                                    0x00B908
+#define R_00B90C_COMPUTE_USER_DATA_3                                    0x00B90C
+#define R_00B910_COMPUTE_USER_DATA_4                                    0x00B910
+#define R_00B914_COMPUTE_USER_DATA_5                                    0x00B914
+#define R_00B918_COMPUTE_USER_DATA_6                                    0x00B918
+#define R_00B91C_COMPUTE_USER_DATA_7                                    0x00B91C
+#define R_00B920_COMPUTE_USER_DATA_8                                    0x00B920
+#define R_00B924_COMPUTE_USER_DATA_9                                    0x00B924
+#define R_00B928_COMPUTE_USER_DATA_10                                   0x00B928
+#define R_00B92C_COMPUTE_USER_DATA_11                                   0x00B92C
+#define R_00B930_COMPUTE_USER_DATA_12                                   0x00B930
+#define R_00B934_COMPUTE_USER_DATA_13                                   0x00B934
+#define R_00B938_COMPUTE_USER_DATA_14                                   0x00B938
+#define R_00B93C_COMPUTE_USER_DATA_15                                   0x00B93C
+#define R_00B9FC_COMPUTE_NOWHERE                                        0x00B9FC /* not contiguous with USER_DATA_15; no field macros defined */
 #define R_028000_DB_RENDER_CONTROL                                      0x028000
 #define   S_028000_DEPTH_CLEAR_ENABLE(x)                              (((x) & 0x1) << 0)
 #define   G_028000_DEPTH_CLEAR_ENABLE(x)                              (((x) >> 0) & 0x1)
@@ -5196,6 +6876,11 @@
 #define   S_028000_COPY_SAMPLE(x)                                     (((x) & 0x0F) << 8)
 #define   G_028000_COPY_SAMPLE(x)                                     (((x) >> 8) & 0x0F)
 #define   C_028000_COPY_SAMPLE                                        0xFFFFF0FF
+/* VI */
+#define   S_028000_DECOMPRESS_ENABLE(x)                               (((x) & 0x1) << 12) /* set: single-bit field at bit 12 */
+#define   G_028000_DECOMPRESS_ENABLE(x)                               (((x) >> 12) & 0x1) /* get: extract bit 12 */
+#define   C_028000_DECOMPRESS_ENABLE                                  0xFFFFEFFF /* clear mask: every bit except bit 12 */
+/*    */
 #define R_028004_DB_COUNT_CONTROL                                       0x028004
 #define   S_028004_ZPASS_INCREMENT_DISABLE(x)                         (((x) & 0x1) << 0)
 #define   G_028004_ZPASS_INCREMENT_DISABLE(x)                         (((x) >> 0) & 0x1)
@@ -5474,9 +7159,6 @@
 #define   S_028040_NUM_SAMPLES(x)                                     (((x) & 0x03) << 2)
 #define   G_028040_NUM_SAMPLES(x)                                     (((x) >> 2) & 0x03)
 #define   C_028040_NUM_SAMPLES                                        0xFFFFFFF3
-#define   S_028040_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
-#define   G_028040_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
-#define   C_028040_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
 /* CIK */
 #define   S_028040_TILE_SPLIT(x)                                      (((x) & 0x07) << 13)
 #define   G_028040_TILE_SPLIT(x)                                      (((x) >> 13) & 0x07)
@@ -5489,6 +7171,14 @@
 #define     V_028040_ADDR_SURF_TILE_SPLIT_2KB                       0x05
 #define     V_028040_ADDR_SURF_TILE_SPLIT_4KB                       0x06
 /*     */
+#define   S_028040_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
+#define   G_028040_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
+#define   C_028040_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
+/* VI */
+#define   S_028040_DECOMPRESS_ON_N_ZPLANES(x)                         (((x) & 0x0F) << 23) /* set: 4-bit field, bits [26:23] */
+#define   G_028040_DECOMPRESS_ON_N_ZPLANES(x)                         (((x) >> 23) & 0x0F) /* get: extract bits [26:23] */
+#define   C_028040_DECOMPRESS_ON_N_ZPLANES                            0xF87FFFFF /* clear mask for bits [26:23] */
+/*    */
 #define   S_028040_ALLOW_EXPCLEAR(x)                                  (((x) & 0x1) << 27)
 #define   G_028040_ALLOW_EXPCLEAR(x)                                  (((x) >> 27) & 0x1)
 #define   C_028040_ALLOW_EXPCLEAR                                     0xF7FFFFFF
@@ -5498,6 +7188,11 @@
 #define   S_028040_TILE_SURFACE_ENABLE(x)                             (((x) & 0x1) << 29)
 #define   G_028040_TILE_SURFACE_ENABLE(x)                             (((x) >> 29) & 0x1)
 #define   C_028040_TILE_SURFACE_ENABLE                                0xDFFFFFFF
+/* VI */
+#define   S_028040_CLEAR_DISALLOWED(x)                                (((x) & 0x1) << 30) /* set: single-bit field at bit 30 */
+#define   G_028040_CLEAR_DISALLOWED(x)                                (((x) >> 30) & 0x1) /* get: extract bit 30 */
+#define   C_028040_CLEAR_DISALLOWED                                   0xBFFFFFFF /* clear mask: every bit except bit 30 */
+/*    */
 #define   S_028040_ZRANGE_PRECISION(x)                                (((x) & 0x1) << 31)
 #define   G_028040_ZRANGE_PRECISION(x)                                (((x) >> 31) & 0x1)
 #define   C_028040_ZRANGE_PRECISION                                   0x7FFFFFFF
@@ -5507,9 +7202,6 @@
 #define   C_028044_FORMAT                                             0xFFFFFFFE
 #define     V_028044_STENCIL_INVALID                                0x00
 #define     V_028044_STENCIL_8                                      0x01
-#define   S_028044_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
-#define   G_028044_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
-#define   C_028044_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
 /* CIK */
 #define   S_028044_TILE_SPLIT(x)                                      (((x) & 0x07) << 13)
 #define   G_028044_TILE_SPLIT(x)                                      (((x) >> 13) & 0x07)
@@ -5522,12 +7214,20 @@
 #define     V_028044_ADDR_SURF_TILE_SPLIT_2KB                       0x05
 #define     V_028044_ADDR_SURF_TILE_SPLIT_4KB                       0x06
 /*     */
+#define   S_028044_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20) /* not on CIK */
+#define   G_028044_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07) /* not on CIK */
+#define   C_028044_TILE_MODE_INDEX                                    0xFF8FFFFF /* not on CIK */
 #define   S_028044_ALLOW_EXPCLEAR(x)                                  (((x) & 0x1) << 27)
 #define   G_028044_ALLOW_EXPCLEAR(x)                                  (((x) >> 27) & 0x1)
 #define   C_028044_ALLOW_EXPCLEAR                                     0xF7FFFFFF
 #define   S_028044_TILE_STENCIL_DISABLE(x)                            (((x) & 0x1) << 29)
 #define   G_028044_TILE_STENCIL_DISABLE(x)                            (((x) >> 29) & 0x1)
 #define   C_028044_TILE_STENCIL_DISABLE                               0xDFFFFFFF
+/* VI */
+#define   S_028044_CLEAR_DISALLOWED(x)                                (((x) & 0x1) << 30) /* set: single-bit field at bit 30 */
+#define   G_028044_CLEAR_DISALLOWED(x)                                (((x) >> 30) & 0x1) /* get: extract bit 30 */
+#define   C_028044_CLEAR_DISALLOWED                                   0xBFFFFFFF /* clear mask: every bit except bit 30 */
+/*    */
 #define R_028048_DB_Z_READ_BASE                                         0x028048
 #define R_02804C_DB_STENCIL_READ_BASE                                   0x02804C
 #define R_028050_DB_Z_WRITE_BASE                                        0x028050
@@ -5549,7 +7249,13 @@
 #define   S_028084_ADDRESS(x)                                         (((x) & 0xFF) << 0)
 #define   G_028084_ADDRESS(x)                                         (((x) >> 0) & 0xFF)
 #define   C_028084_ADDRESS                                            0xFFFFFF00
-/* */
+#define R_0281E8_COHER_DEST_BASE_HI_0                                   0x0281E8 /* COHER_DEST_BASE_HI_0..3: one register every 4 bytes */
+#define R_0281EC_COHER_DEST_BASE_HI_1                                   0x0281EC
+#define R_0281F0_COHER_DEST_BASE_HI_2                                   0x0281F0
+#define R_0281F4_COHER_DEST_BASE_HI_3                                   0x0281F4
+/*     */
+#define R_0281F8_COHER_DEST_BASE_2                                      0x0281F8 /* COHER_DEST_BASE_0/1 are defined separately at 0x028248/0x02824C */
+#define R_0281FC_COHER_DEST_BASE_3                                      0x0281FC
 #define R_028200_PA_SC_WINDOW_OFFSET                                    0x028200
 #define   S_028200_WINDOW_X_OFFSET(x)                                 (((x) & 0xFFFF) << 0)
 #define   G_028200_WINDOW_X_OFFSET(x)                                 (((x) >> 0) & 0xFFFF)
@@ -5694,6 +7400,8 @@
 #define   S_028244_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
 #define   G_028244_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
 #define   C_028244_BR_Y                                               0x8000FFFF
+#define R_028248_COHER_DEST_BASE_0                                      0x028248 /* COHER_DEST_BASE_2/3 are not contiguous with these; they sit at 0x0281F8/0x0281FC */
+#define R_02824C_COHER_DEST_BASE_1                                      0x02824C
 #define R_028250_PA_SC_VPORT_SCISSOR_0_TL                               0x028250
 #define   S_028250_TL_X(x)                                            (((x) & 0x7FFF) << 0)
 #define   G_028250_TL_X(x)                                            (((x) >> 0) & 0x7FFF)
@@ -5711,8 +7419,68 @@
 #define   S_028254_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
 #define   G_028254_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
 #define   C_028254_BR_Y                                               0x8000FFFF
+#define R_028258_PA_SC_VPORT_SCISSOR_1_TL                               0x028258 /* scissors 1..15: a TL/BR register pair every 8 bytes; presumably the S_028250_/S_028254_ field macros apply to each index -- confirm */
+#define R_02825C_PA_SC_VPORT_SCISSOR_1_BR                               0x02825C
+#define R_028260_PA_SC_VPORT_SCISSOR_2_TL                               0x028260
+#define R_028264_PA_SC_VPORT_SCISSOR_2_BR                               0x028264
+#define R_028268_PA_SC_VPORT_SCISSOR_3_TL                               0x028268
+#define R_02826C_PA_SC_VPORT_SCISSOR_3_BR                               0x02826C
+#define R_028270_PA_SC_VPORT_SCISSOR_4_TL                               0x028270
+#define R_028274_PA_SC_VPORT_SCISSOR_4_BR                               0x028274
+#define R_028278_PA_SC_VPORT_SCISSOR_5_TL                               0x028278
+#define R_02827C_PA_SC_VPORT_SCISSOR_5_BR                               0x02827C
+#define R_028280_PA_SC_VPORT_SCISSOR_6_TL                               0x028280
+#define R_028284_PA_SC_VPORT_SCISSOR_6_BR                               0x028284
+#define R_028288_PA_SC_VPORT_SCISSOR_7_TL                               0x028288
+#define R_02828C_PA_SC_VPORT_SCISSOR_7_BR                               0x02828C
+#define R_028290_PA_SC_VPORT_SCISSOR_8_TL                               0x028290
+#define R_028294_PA_SC_VPORT_SCISSOR_8_BR                               0x028294
+#define R_028298_PA_SC_VPORT_SCISSOR_9_TL                               0x028298
+#define R_02829C_PA_SC_VPORT_SCISSOR_9_BR                               0x02829C
+#define R_0282A0_PA_SC_VPORT_SCISSOR_10_TL                              0x0282A0
+#define R_0282A4_PA_SC_VPORT_SCISSOR_10_BR                              0x0282A4
+#define R_0282A8_PA_SC_VPORT_SCISSOR_11_TL                              0x0282A8
+#define R_0282AC_PA_SC_VPORT_SCISSOR_11_BR                              0x0282AC
+#define R_0282B0_PA_SC_VPORT_SCISSOR_12_TL                              0x0282B0
+#define R_0282B4_PA_SC_VPORT_SCISSOR_12_BR                              0x0282B4
+#define R_0282B8_PA_SC_VPORT_SCISSOR_13_TL                              0x0282B8
+#define R_0282BC_PA_SC_VPORT_SCISSOR_13_BR                              0x0282BC
+#define R_0282C0_PA_SC_VPORT_SCISSOR_14_TL                              0x0282C0
+#define R_0282C4_PA_SC_VPORT_SCISSOR_14_BR                              0x0282C4
+#define R_0282C8_PA_SC_VPORT_SCISSOR_15_TL                              0x0282C8
+#define R_0282CC_PA_SC_VPORT_SCISSOR_15_BR                              0x0282CC
 #define R_0282D0_PA_SC_VPORT_ZMIN_0                                     0x0282D0
 #define R_0282D4_PA_SC_VPORT_ZMAX_0                                     0x0282D4
+#define R_0282D8_PA_SC_VPORT_ZMIN_1                                     0x0282D8 /* viewports 1..15: a ZMIN/ZMAX register pair every 8 bytes, 0x0282D8..0x02834C */
+#define R_0282DC_PA_SC_VPORT_ZMAX_1                                     0x0282DC
+#define R_0282E0_PA_SC_VPORT_ZMIN_2                                     0x0282E0
+#define R_0282E4_PA_SC_VPORT_ZMAX_2                                     0x0282E4
+#define R_0282E8_PA_SC_VPORT_ZMIN_3                                     0x0282E8
+#define R_0282EC_PA_SC_VPORT_ZMAX_3                                     0x0282EC
+#define R_0282F0_PA_SC_VPORT_ZMIN_4                                     0x0282F0
+#define R_0282F4_PA_SC_VPORT_ZMAX_4                                     0x0282F4
+#define R_0282F8_PA_SC_VPORT_ZMIN_5                                     0x0282F8
+#define R_0282FC_PA_SC_VPORT_ZMAX_5                                     0x0282FC
+#define R_028300_PA_SC_VPORT_ZMIN_6                                     0x028300
+#define R_028304_PA_SC_VPORT_ZMAX_6                                     0x028304
+#define R_028308_PA_SC_VPORT_ZMIN_7                                     0x028308
+#define R_02830C_PA_SC_VPORT_ZMAX_7                                     0x02830C
+#define R_028310_PA_SC_VPORT_ZMIN_8                                     0x028310
+#define R_028314_PA_SC_VPORT_ZMAX_8                                     0x028314
+#define R_028318_PA_SC_VPORT_ZMIN_9                                     0x028318
+#define R_02831C_PA_SC_VPORT_ZMAX_9                                     0x02831C
+#define R_028320_PA_SC_VPORT_ZMIN_10                                    0x028320
+#define R_028324_PA_SC_VPORT_ZMAX_10                                    0x028324
+#define R_028328_PA_SC_VPORT_ZMIN_11                                    0x028328
+#define R_02832C_PA_SC_VPORT_ZMAX_11                                    0x02832C
+#define R_028330_PA_SC_VPORT_ZMIN_12                                    0x028330
+#define R_028334_PA_SC_VPORT_ZMAX_12                                    0x028334
+#define R_028338_PA_SC_VPORT_ZMIN_13                                    0x028338
+#define R_02833C_PA_SC_VPORT_ZMAX_13                                    0x02833C
+#define R_028340_PA_SC_VPORT_ZMIN_14                                    0x028340
+#define R_028344_PA_SC_VPORT_ZMAX_14                                    0x028344
+#define R_028348_PA_SC_VPORT_ZMIN_15                                    0x028348
+#define R_02834C_PA_SC_VPORT_ZMAX_15                                    0x02834C
 #define R_028350_PA_SC_RASTER_CONFIG                                    0x028350
 #define   S_028350_RB_MAP_PKR0(x)                                     (((x) & 0x03) << 0)
 #define   G_028350_RB_MAP_PKR0(x)                                     (((x) >> 0) & 0x03)
@@ -5834,6 +7602,13 @@
 #define     V_028354_RASTER_CONFIG_SE_PAIR_YSEL_16_WIDE_TILE        0x01
 #define     V_028354_RASTER_CONFIG_SE_PAIR_YSEL_32_WIDE_TILE        0x02
 #define     V_028354_RASTER_CONFIG_SE_PAIR_YSEL_64_WIDE_TILE        0x03
+#define R_028358_PA_SC_SCREEN_EXTENT_CONTROL                            0x028358
+#define   S_028358_SLICE_EVEN_ENABLE(x)                               (((x) & 0x03) << 0)
+#define   G_028358_SLICE_EVEN_ENABLE(x)                               (((x) >> 0) & 0x03)
+#define   C_028358_SLICE_EVEN_ENABLE                                  0xFFFFFFFC
+#define   S_028358_SLICE_ODD_ENABLE(x)                                (((x) & 0x03) << 2)
+#define   G_028358_SLICE_ODD_ENABLE(x)                                (((x) >> 2) & 0x03)
+#define   C_028358_SLICE_ODD_ENABLE                                   0xFFFFFFF3
 /*     */
 #define R_028400_VGT_MAX_VTX_INDX                                       0x028400
 #define R_028404_VGT_MIN_VTX_INDX                                       0x028404
@@ -5843,6 +7618,18 @@
 #define R_028418_CB_BLEND_GREEN                                         0x028418
 #define R_02841C_CB_BLEND_BLUE                                          0x02841C
 #define R_028420_CB_BLEND_ALPHA                                         0x028420
+/* VI */
+#define R_028424_CB_DCC_CONTROL                                         0x028424
+#define   S_028424_OVERWRITE_COMBINER_DISABLE(x)                      (((x) & 0x1) << 0)
+#define   G_028424_OVERWRITE_COMBINER_DISABLE(x)                      (((x) >> 0) & 0x1)
+#define   C_028424_OVERWRITE_COMBINER_DISABLE                         0xFFFFFFFE
+#define   S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x)          (((x) & 0x1) << 1)
+#define   G_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(x)          (((x) >> 1) & 0x1)
+#define   C_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE             0xFFFFFFFD
+#define   S_028424_OVERWRITE_COMBINER_WATERMARK(x)                    (((x) & 0x1F) << 2)
+#define   G_028424_OVERWRITE_COMBINER_WATERMARK(x)                    (((x) >> 2) & 0x1F)
+#define   C_028424_OVERWRITE_COMBINER_WATERMARK                       0xFFFFFF83
+/*    */
 #define R_02842C_DB_STENCIL_CONTROL                                     0x02842C
 #define   S_02842C_STENCILFAIL(x)                                     (((x) & 0x0F) << 0)
 #define   G_02842C_STENCILFAIL(x)                                     (((x) >> 0) & 0x0F)
@@ -5984,12 +7771,102 @@
 #define   S_028434_STENCILOPVAL_BF(x)                                 (((x) & 0xFF) << 24)
 #define   G_028434_STENCILOPVAL_BF(x)                                 (((x) >> 24) & 0xFF)
 #define   C_028434_STENCILOPVAL_BF                                    0x00FFFFFF
-#define R_02843C_PA_CL_VPORT_XSCALE_0                                   0x02843C
-#define R_028440_PA_CL_VPORT_XOFFSET_0                                  0x028440
-#define R_028444_PA_CL_VPORT_YSCALE_0                                   0x028444
-#define R_028448_PA_CL_VPORT_YOFFSET_0                                  0x028448
-#define R_02844C_PA_CL_VPORT_ZSCALE_0                                   0x02844C
-#define R_028450_PA_CL_VPORT_ZOFFSET_0                                  0x028450
+#define R_02843C_PA_CL_VPORT_XSCALE                                     0x02843C
+#define R_028440_PA_CL_VPORT_XOFFSET                                    0x028440
+#define R_028444_PA_CL_VPORT_YSCALE                                     0x028444
+#define R_028448_PA_CL_VPORT_YOFFSET                                    0x028448
+#define R_02844C_PA_CL_VPORT_ZSCALE                                     0x02844C
+#define R_028450_PA_CL_VPORT_ZOFFSET                                    0x028450
+#define R_028454_PA_CL_VPORT_XSCALE_1                                   0x028454
+#define R_028458_PA_CL_VPORT_XOFFSET_1                                  0x028458
+#define R_02845C_PA_CL_VPORT_YSCALE_1                                   0x02845C
+#define R_028460_PA_CL_VPORT_YOFFSET_1                                  0x028460
+#define R_028464_PA_CL_VPORT_ZSCALE_1                                   0x028464
+#define R_028468_PA_CL_VPORT_ZOFFSET_1                                  0x028468
+#define R_02846C_PA_CL_VPORT_XSCALE_2                                   0x02846C
+#define R_028470_PA_CL_VPORT_XOFFSET_2                                  0x028470
+#define R_028474_PA_CL_VPORT_YSCALE_2                                   0x028474
+#define R_028478_PA_CL_VPORT_YOFFSET_2                                  0x028478
+#define R_02847C_PA_CL_VPORT_ZSCALE_2                                   0x02847C
+#define R_028480_PA_CL_VPORT_ZOFFSET_2                                  0x028480
+#define R_028484_PA_CL_VPORT_XSCALE_3                                   0x028484
+#define R_028488_PA_CL_VPORT_XOFFSET_3                                  0x028488
+#define R_02848C_PA_CL_VPORT_YSCALE_3                                   0x02848C
+#define R_028490_PA_CL_VPORT_YOFFSET_3                                  0x028490
+#define R_028494_PA_CL_VPORT_ZSCALE_3                                   0x028494
+#define R_028498_PA_CL_VPORT_ZOFFSET_3                                  0x028498
+#define R_02849C_PA_CL_VPORT_XSCALE_4                                   0x02849C
+#define R_0284A0_PA_CL_VPORT_XOFFSET_4                                  0x0284A0
+#define R_0284A4_PA_CL_VPORT_YSCALE_4                                   0x0284A4
+#define R_0284A8_PA_CL_VPORT_YOFFSET_4                                  0x0284A8
+#define R_0284AC_PA_CL_VPORT_ZSCALE_4                                   0x0284AC
+#define R_0284B0_PA_CL_VPORT_ZOFFSET_4                                  0x0284B0
+#define R_0284B4_PA_CL_VPORT_XSCALE_5                                   0x0284B4
+#define R_0284B8_PA_CL_VPORT_XOFFSET_5                                  0x0284B8
+#define R_0284BC_PA_CL_VPORT_YSCALE_5                                   0x0284BC
+#define R_0284C0_PA_CL_VPORT_YOFFSET_5                                  0x0284C0
+#define R_0284C4_PA_CL_VPORT_ZSCALE_5                                   0x0284C4
+#define R_0284C8_PA_CL_VPORT_ZOFFSET_5                                  0x0284C8
+#define R_0284CC_PA_CL_VPORT_XSCALE_6                                   0x0284CC
+#define R_0284D0_PA_CL_VPORT_XOFFSET_6                                  0x0284D0
+#define R_0284D4_PA_CL_VPORT_YSCALE_6                                   0x0284D4
+#define R_0284D8_PA_CL_VPORT_YOFFSET_6                                  0x0284D8
+#define R_0284DC_PA_CL_VPORT_ZSCALE_6                                   0x0284DC
+#define R_0284E0_PA_CL_VPORT_ZOFFSET_6                                  0x0284E0
+#define R_0284E4_PA_CL_VPORT_XSCALE_7                                   0x0284E4
+#define R_0284E8_PA_CL_VPORT_XOFFSET_7                                  0x0284E8
+#define R_0284EC_PA_CL_VPORT_YSCALE_7                                   0x0284EC
+#define R_0284F0_PA_CL_VPORT_YOFFSET_7                                  0x0284F0
+#define R_0284F4_PA_CL_VPORT_ZSCALE_7                                   0x0284F4
+#define R_0284F8_PA_CL_VPORT_ZOFFSET_7                                  0x0284F8
+#define R_0284FC_PA_CL_VPORT_XSCALE_8                                   0x0284FC
+#define R_028500_PA_CL_VPORT_XOFFSET_8                                  0x028500
+#define R_028504_PA_CL_VPORT_YSCALE_8                                   0x028504
+#define R_028508_PA_CL_VPORT_YOFFSET_8                                  0x028508
+#define R_02850C_PA_CL_VPORT_ZSCALE_8                                   0x02850C
+#define R_028510_PA_CL_VPORT_ZOFFSET_8                                  0x028510
+#define R_028514_PA_CL_VPORT_XSCALE_9                                   0x028514
+#define R_028518_PA_CL_VPORT_XOFFSET_9                                  0x028518
+#define R_02851C_PA_CL_VPORT_YSCALE_9                                   0x02851C
+#define R_028520_PA_CL_VPORT_YOFFSET_9                                  0x028520
+#define R_028524_PA_CL_VPORT_ZSCALE_9                                   0x028524
+#define R_028528_PA_CL_VPORT_ZOFFSET_9                                  0x028528
+#define R_02852C_PA_CL_VPORT_XSCALE_10                                  0x02852C
+#define R_028530_PA_CL_VPORT_XOFFSET_10                                 0x028530
+#define R_028534_PA_CL_VPORT_YSCALE_10                                  0x028534
+#define R_028538_PA_CL_VPORT_YOFFSET_10                                 0x028538
+#define R_02853C_PA_CL_VPORT_ZSCALE_10                                  0x02853C
+#define R_028540_PA_CL_VPORT_ZOFFSET_10                                 0x028540
+#define R_028544_PA_CL_VPORT_XSCALE_11                                  0x028544
+#define R_028548_PA_CL_VPORT_XOFFSET_11                                 0x028548
+#define R_02854C_PA_CL_VPORT_YSCALE_11                                  0x02854C
+#define R_028550_PA_CL_VPORT_YOFFSET_11                                 0x028550
+#define R_028554_PA_CL_VPORT_ZSCALE_11                                  0x028554
+#define R_028558_PA_CL_VPORT_ZOFFSET_11                                 0x028558
+#define R_02855C_PA_CL_VPORT_XSCALE_12                                  0x02855C
+#define R_028560_PA_CL_VPORT_XOFFSET_12                                 0x028560
+#define R_028564_PA_CL_VPORT_YSCALE_12                                  0x028564
+#define R_028568_PA_CL_VPORT_YOFFSET_12                                 0x028568
+#define R_02856C_PA_CL_VPORT_ZSCALE_12                                  0x02856C
+#define R_028570_PA_CL_VPORT_ZOFFSET_12                                 0x028570
+#define R_028574_PA_CL_VPORT_XSCALE_13                                  0x028574
+#define R_028578_PA_CL_VPORT_XOFFSET_13                                 0x028578
+#define R_02857C_PA_CL_VPORT_YSCALE_13                                  0x02857C
+#define R_028580_PA_CL_VPORT_YOFFSET_13                                 0x028580
+#define R_028584_PA_CL_VPORT_ZSCALE_13                                  0x028584
+#define R_028588_PA_CL_VPORT_ZOFFSET_13                                 0x028588
+#define R_02858C_PA_CL_VPORT_XSCALE_14                                  0x02858C
+#define R_028590_PA_CL_VPORT_XOFFSET_14                                 0x028590
+#define R_028594_PA_CL_VPORT_YSCALE_14                                  0x028594
+#define R_028598_PA_CL_VPORT_YOFFSET_14                                 0x028598
+#define R_02859C_PA_CL_VPORT_ZSCALE_14                                  0x02859C
+#define R_0285A0_PA_CL_VPORT_ZOFFSET_14                                 0x0285A0
+#define R_0285A4_PA_CL_VPORT_XSCALE_15                                  0x0285A4
+#define R_0285A8_PA_CL_VPORT_XOFFSET_15                                 0x0285A8
+#define R_0285AC_PA_CL_VPORT_YSCALE_15                                  0x0285AC
+#define R_0285B0_PA_CL_VPORT_YOFFSET_15                                 0x0285B0
+#define R_0285B4_PA_CL_VPORT_ZSCALE_15                                  0x0285B4
+#define R_0285B8_PA_CL_VPORT_ZOFFSET_15                                 0x0285B8
 #define R_0285BC_PA_CL_UCP_0_X                                          0x0285BC
 #define R_0285C0_PA_CL_UCP_0_Y                                          0x0285C0
 #define R_0285C4_PA_CL_UCP_0_Z                                          0x0285C4
@@ -6036,6 +7913,26 @@
 #define   G_028644_DUP(x)                                             (((x) >> 18) & 0x1)
 #define   C_028644_DUP                                                0xFFFBFFFF
 /*     */
+/* VI */
+#define   S_028644_FP16_INTERP_MODE(x)                                (((x) & 0x1) << 19)
+#define   G_028644_FP16_INTERP_MODE(x)                                (((x) >> 19) & 0x1)
+#define   C_028644_FP16_INTERP_MODE                                   0xFFF7FFFF
+#define   S_028644_USE_DEFAULT_ATTR1(x)                               (((x) & 0x1) << 20)
+#define   G_028644_USE_DEFAULT_ATTR1(x)                               (((x) >> 20) & 0x1)
+#define   C_028644_USE_DEFAULT_ATTR1                                  0xFFEFFFFF
+#define   S_028644_DEFAULT_VAL_ATTR1(x)                               (((x) & 0x03) << 21)
+#define   G_028644_DEFAULT_VAL_ATTR1(x)                               (((x) >> 21) & 0x03)
+#define   C_028644_DEFAULT_VAL_ATTR1                                  0xFF9FFFFF
+#define   S_028644_PT_SPRITE_TEX_ATTR1(x)                             (((x) & 0x1) << 23)
+#define   G_028644_PT_SPRITE_TEX_ATTR1(x)                             (((x) >> 23) & 0x1)
+#define   C_028644_PT_SPRITE_TEX_ATTR1                                0xFF7FFFFF
+#define   S_028644_ATTR0_VALID(x)                                     (((x) & 0x1) << 24)
+#define   G_028644_ATTR0_VALID(x)                                     (((x) >> 24) & 0x1)
+#define   C_028644_ATTR0_VALID                                        0xFEFFFFFF
+#define   S_028644_ATTR1_VALID(x)                                     (((x) & 0x1) << 25)
+#define   G_028644_ATTR1_VALID(x)                                     (((x) >> 25) & 0x1)
+#define   C_028644_ATTR1_VALID                                        0xFDFFFFFF
+/*    */
 #define R_028648_SPI_PS_INPUT_CNTL_1                                    0x028648
 #define R_02864C_SPI_PS_INPUT_CNTL_2                                    0x02864C
 #define R_028650_SPI_PS_INPUT_CNTL_3                                    0x028650
@@ -6559,6 +8456,10 @@
 #define R_028794_CB_BLEND5_CONTROL                                      0x028794
 #define R_028798_CB_BLEND6_CONTROL                                      0x028798
 #define R_02879C_CB_BLEND7_CONTROL                                      0x02879C
+#define R_0287CC_CS_COPY_STATE                                          0x0287CC
+#define   S_0287CC_SRC_STATE_ID(x)                                    (((x) & 0x07) << 0)
+#define   G_0287CC_SRC_STATE_ID(x)                                    (((x) >> 0) & 0x07)
+#define   C_0287CC_SRC_STATE_ID                                       0xFFFFFFF8
 #define R_0287D4_PA_CL_POINT_X_RAD                                      0x0287D4
 #define R_0287D8_PA_CL_POINT_Y_RAD                                      0x0287D8
 #define R_0287DC_PA_CL_POINT_SIZE                                       0x0287DC
@@ -6588,6 +8489,10 @@
 #define   G_0287F0_USE_OPAQUE(x)                                      (((x) >> 6) & 0x1)
 #define   C_0287F0_USE_OPAQUE                                         0xFFFFFFBF
 #define R_0287F4_VGT_IMMED_DATA                                         0x0287F4 /* not on CIK */
+#define R_0287F8_VGT_EVENT_ADDRESS_REG                                  0x0287F8
+#define   S_0287F8_ADDRESS_LOW(x)                                     (((x) & 0xFFFFFFF) << 0)
+#define   G_0287F8_ADDRESS_LOW(x)                                     (((x) >> 0) & 0xFFFFFFF)
+#define   C_0287F8_ADDRESS_LOW                                        0xF0000000
 #define R_028800_DB_DEPTH_CONTROL                                       0x028800
 #define   S_028800_STENCIL_ENABLE(x)                                  (((x) & 0x1) << 0)
 #define   G_028800_STENCIL_ENABLE(x)                                  (((x) >> 0) & 0x1)
@@ -6644,36 +8549,42 @@
 #define   G_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x)              (((x) >> 31) & 0x1)
 #define   C_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS                 0x7FFFFFFF
 #define R_028804_DB_EQAA                                                0x028804
-#define   S_028804_MAX_ANCHOR_SAMPLES(x)		(((x) & 0x7) << 0)
-#define   G_028804_MAX_ANCHOR_SAMPLES(x)		(((x) >> 0) & 0x7)
-#define   C_028804_MAX_ANCHOR_SAMPLES			(~(((~0) & 0x7) << 0))
-#define   S_028804_PS_ITER_SAMPLES(x)			(((x) & 0x7) << 4)
-#define   G_028804_PS_ITER_SAMPLES(x)			(((x) >> 4) & 0x7)
-#define   C_028804_PS_ITER_SAMPLES			(~(((~0) & 0x7) << 4))
-#define   S_028804_MASK_EXPORT_NUM_SAMPLES(x)		(((x) & 0x7) << 8)
-#define   G_028804_MASK_EXPORT_NUM_SAMPLES(x)		(((x) >> 8) & 0x7)
-#define   C_028804_MASK_EXPORT_NUM_SAMPLES		(~(((~0) & 0x7) << 8))
-#define   S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)		(((x) & 0x7) << 12)
-#define   G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)		(((x) >> 12) & 0x7)
-#define   C_028804_ALPHA_TO_MASK_NUM_SAMPLES		(~(((~0) & 0x7) << 12))
-#define   S_028804_HIGH_QUALITY_INTERSECTIONS(x)	(((x) & 0x1) << 16)
-#define   G_028804_HIGH_QUALITY_INTERSECTIONS(x)	(((x) >> 16) & 0x1)
-#define   C_028804_HIGH_QUALITY_INTERSECTIONS		(~(((~0) & 0x1) << 16))
-#define   S_028804_INCOHERENT_EQAA_READS(x)		(((x) & 0x1) << 17)
-#define   G_028804_INCOHERENT_EQAA_READS(x)		(((x) >> 17) & 0x1)
-#define   C_028804_INCOHERENT_EQAA_READS		(~(((~0) & 0x1) << 17))
-#define   S_028804_INTERPOLATE_COMP_Z(x)		(((x) & 0x1) << 18)
-#define   G_028804_INTERPOLATE_COMP_Z(x)		(((x) >> 18) & 0x1)
-#define   C_028804_INTERPOLATE_COMP_Z			(~(((~0) >> 18) & 0x1))
-#define   S_028804_INTERPOLATE_SRC_Z(x)			(((x) & 0x1) << 19)
-#define   G_028804_INTERPOLATE_SRC_Z(x)			(((x) >> 19) & 0x1)
-#define   C_028804_INTERPOLATE_SRC_Z			(~(((~0) & 0x1) << 19))
-#define   S_028804_STATIC_ANCHOR_ASSOCIATIONS(x)	(((x) & 0x1) << 20)
-#define   G_028804_STATIC_ANCHOR_ASSOCIATIONS(x)	(((x) >> 20) & 0x1)
-#define   C_028804_STATIC_ANCHOR_ASSOCIATIONS		(~(((~0) & 0x1) << 20))
-#define   S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)	(((x) & 0x1) << 21)
-#define   G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)	(((x) >> 21) & 0x1)
-#define   C_028804_ALPHA_TO_MASK_EQAA_DISABLE		(~(((~0) & 0x1) << 21))
+#define   S_028804_MAX_ANCHOR_SAMPLES(x)                              (((x) & 0x07) << 0)
+#define   G_028804_MAX_ANCHOR_SAMPLES(x)                              (((x) >> 0) & 0x07)
+#define   C_028804_MAX_ANCHOR_SAMPLES                                 0xFFFFFFF8
+#define   S_028804_PS_ITER_SAMPLES(x)                                 (((x) & 0x07) << 4)
+#define   G_028804_PS_ITER_SAMPLES(x)                                 (((x) >> 4) & 0x07)
+#define   C_028804_PS_ITER_SAMPLES                                    0xFFFFFF8F
+#define   S_028804_MASK_EXPORT_NUM_SAMPLES(x)                         (((x) & 0x07) << 8)
+#define   G_028804_MASK_EXPORT_NUM_SAMPLES(x)                         (((x) >> 8) & 0x07)
+#define   C_028804_MASK_EXPORT_NUM_SAMPLES                            0xFFFFF8FF
+#define   S_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)                       (((x) & 0x07) << 12)
+#define   G_028804_ALPHA_TO_MASK_NUM_SAMPLES(x)                       (((x) >> 12) & 0x07)
+#define   C_028804_ALPHA_TO_MASK_NUM_SAMPLES                          0xFFFF8FFF
+#define   S_028804_HIGH_QUALITY_INTERSECTIONS(x)                      (((x) & 0x1) << 16)
+#define   G_028804_HIGH_QUALITY_INTERSECTIONS(x)                      (((x) >> 16) & 0x1)
+#define   C_028804_HIGH_QUALITY_INTERSECTIONS                         0xFFFEFFFF
+#define   S_028804_INCOHERENT_EQAA_READS(x)                           (((x) & 0x1) << 17)
+#define   G_028804_INCOHERENT_EQAA_READS(x)                           (((x) >> 17) & 0x1)
+#define   C_028804_INCOHERENT_EQAA_READS                              0xFFFDFFFF
+#define   S_028804_INTERPOLATE_COMP_Z(x)                              (((x) & 0x1) << 18)
+#define   G_028804_INTERPOLATE_COMP_Z(x)                              (((x) >> 18) & 0x1)
+#define   C_028804_INTERPOLATE_COMP_Z                                 0xFFFBFFFF
+#define   S_028804_INTERPOLATE_SRC_Z(x)                               (((x) & 0x1) << 19)
+#define   G_028804_INTERPOLATE_SRC_Z(x)                               (((x) >> 19) & 0x1)
+#define   C_028804_INTERPOLATE_SRC_Z                                  0xFFF7FFFF
+#define   S_028804_STATIC_ANCHOR_ASSOCIATIONS(x)                      (((x) & 0x1) << 20)
+#define   G_028804_STATIC_ANCHOR_ASSOCIATIONS(x)                      (((x) >> 20) & 0x1)
+#define   C_028804_STATIC_ANCHOR_ASSOCIATIONS                         0xFFEFFFFF
+#define   S_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)                      (((x) & 0x1) << 21)
+#define   G_028804_ALPHA_TO_MASK_EQAA_DISABLE(x)                      (((x) >> 21) & 0x1)
+#define   C_028804_ALPHA_TO_MASK_EQAA_DISABLE                         0xFFDFFFFF
+#define   S_028804_OVERRASTERIZATION_AMOUNT(x)                        (((x) & 0x07) << 24)
+#define   G_028804_OVERRASTERIZATION_AMOUNT(x)                        (((x) >> 24) & 0x07)
+#define   C_028804_OVERRASTERIZATION_AMOUNT                           0xF8FFFFFF
+#define   S_028804_ENABLE_POSTZ_OVERRASTERIZATION(x)                  (((x) & 0x1) << 27)
+#define   G_028804_ENABLE_POSTZ_OVERRASTERIZATION(x)                  (((x) >> 27) & 0x1)
+#define   C_028804_ENABLE_POSTZ_OVERRASTERIZATION                     0xF7FFFFFF
 #define R_028808_CB_COLOR_CONTROL                                       0x028808
 #define   S_028808_DEGAMMA_ENABLE(x)                                  (((x) & 0x1) << 3)
 #define   G_028808_DEGAMMA_ENABLE(x)                                  (((x) >> 3) & 0x1)
@@ -6977,6 +8888,11 @@
 #define   S_02881C_USE_VTX_GS_CUT_FLAG(x)                             (((x) & 0x1) << 25)
 #define   G_02881C_USE_VTX_GS_CUT_FLAG(x)                             (((x) >> 25) & 0x1)
 #define   C_02881C_USE_VTX_GS_CUT_FLAG                                0xFDFFFFFF
+/* VI */
+#define   S_02881C_USE_VTX_LINE_WIDTH(x)                              (((x) & 0x1) << 26)
+#define   G_02881C_USE_VTX_LINE_WIDTH(x)                              (((x) >> 26) & 0x1)
+#define   C_02881C_USE_VTX_LINE_WIDTH                                 0xFBFFFFFF
+/*    */
 #define R_028820_PA_CL_NANINF_CNTL                                      0x028820
 #define   S_028820_VTE_XY_INF_DISCARD(x)                              (((x) & 0x1) << 0)
 #define   G_028820_VTE_XY_INF_DISCARD(x)                              (((x) >> 0) & 0x1)
@@ -7447,9 +9363,21 @@
 #define   S_028A4C_PS_ITER_SAMPLE(x)                                  (((x) & 0x1) << 16)
 #define   G_028A4C_PS_ITER_SAMPLE(x)                                  (((x) >> 16) & 0x1)
 #define   C_028A4C_PS_ITER_SAMPLE                                     0xFFFEFFFF
-#define   S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x)                   (((x) & 0x1) << 17)
-#define   G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x)                   (((x) >> 17) & 0x1)
-#define   C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC                      0xFFFDFFFF
+#define   S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x)         (((x) & 0x1) << 17)
+#define   G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(x)         (((x) >> 17) & 0x1)
+#define   C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE            0xFFFDFFFF
+#define   S_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x)                      (((x) & 0x1) << 18)
+#define   G_028A4C_MULTI_GPU_SUPERTILE_ENABLE(x)                      (((x) >> 18) & 0x1)
+#define   C_028A4C_MULTI_GPU_SUPERTILE_ENABLE                         0xFFFBFFFF
+#define   S_028A4C_GPU_ID_OVERRIDE_ENABLE(x)                          (((x) & 0x1) << 19)
+#define   G_028A4C_GPU_ID_OVERRIDE_ENABLE(x)                          (((x) >> 19) & 0x1)
+#define   C_028A4C_GPU_ID_OVERRIDE_ENABLE                             0xFFF7FFFF
+#define   S_028A4C_GPU_ID_OVERRIDE(x)                                 (((x) & 0x0F) << 20)
+#define   G_028A4C_GPU_ID_OVERRIDE(x)                                 (((x) >> 20) & 0x0F)
+#define   C_028A4C_GPU_ID_OVERRIDE                                    0xFF0FFFFF
+#define   S_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x)                   (((x) & 0x1) << 24)
+#define   G_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE(x)                   (((x) >> 24) & 0x1)
+#define   C_028A4C_MULTI_GPU_PRIM_DISCARD_ENABLE                      0xFEFFFFFF
 #define   S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)                         (((x) & 0x1) << 25)
 #define   G_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)                         (((x) >> 25) & 0x1)
 #define   C_028A4C_FORCE_EOV_CNTDWN_ENABLE                            0xFDFFFFFF
@@ -7515,6 +9443,7 @@
 #define   C_028A7C_INDEX_TYPE                                         0xFFFFFFFC
 #define     V_028A7C_VGT_INDEX_16                                   0x00
 #define     V_028A7C_VGT_INDEX_32                                   0x01
+#define     V_028A7C_VGT_INDEX_8                                    0x02 /* VI */
 #define   S_028A7C_SWAP_MODE(x)                                       (((x) & 0x03) << 2)
 #define   G_028A7C_SWAP_MODE(x)                                       (((x) >> 2) & 0x03)
 #define   C_028A7C_SWAP_MODE                                          0xFFFFFFF3
@@ -7544,6 +9473,12 @@
 #define   G_028A7C_REQ_PATH(x)                                        (((x) >> 10) & 0x1)
 #define   C_028A7C_REQ_PATH                                           0xFFFFFBFF
 /*     */
+/* VI */
+#define   S_028A7C_MTYPE(x)                                           (((x) & 0x03) << 11)
+#define   G_028A7C_MTYPE(x)                                           (((x) >> 11) & 0x03)
+#define   C_028A7C_MTYPE                                              0xFFFFE7FF
+/*    */
+#define R_028A80_WD_ENHANCE                                             0x028A80
 #define R_028A84_VGT_PRIMITIVEID_EN                                     0x028A84
 #define   S_028A84_PRIMITIVEID_EN(x)                                  (((x) & 0x1) << 0)
 #define   G_028A84_PRIMITIVEID_EN(x)                                  (((x) >> 0) & 0x1)
@@ -7642,6 +9577,10 @@
 #define   S_028AA8_WD_SWITCH_ON_EOP(x)                                (((x) & 0x1) << 20)
 #define   G_028AA8_WD_SWITCH_ON_EOP(x)                                (((x) >> 20) & 0x1)
 #define   C_028AA8_WD_SWITCH_ON_EOP                                   0xFFEFFFFF
+/* VI */
+#define   S_028AA8_MAX_PRIMGRP_IN_WAVE(x)                             (((x) & 0x0F) << 28)
+#define   G_028AA8_MAX_PRIMGRP_IN_WAVE(x)                             (((x) >> 28) & 0x0F)
+#define   C_028AA8_MAX_PRIMGRP_IN_WAVE                                0x0FFFFFFF
 /*     */
 #define R_028AAC_VGT_ESGS_RING_ITEMSIZE                                 0x028AAC
 #define   S_028AAC_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
@@ -7681,6 +9620,11 @@
 #define   S_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x)                         (((x) & 0x1) << 16)
 #define   G_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x)                         (((x) >> 16) & 0x1)
 #define   C_028ABC_DST_OUTSIDE_ZERO_TO_ONE                            0xFFFEFFFF
+/* VI */
+#define   S_028ABC_TC_COMPATIBLE(x)                                   (((x) & 0x1) << 17)
+#define   G_028ABC_TC_COMPATIBLE(x)                                   (((x) >> 17) & 0x1)
+#define   C_028ABC_TC_COMPATIBLE                                      0xFFFDFFFF
+/*    */
 #define R_028AC0_DB_SRESULTS_COMPARE_STATE0                             0x028AC0
 #define   S_028AC0_COMPAREFUNC0(x)                                    (((x) & 0x07) << 0)
 #define   G_028AC0_COMPAREFUNC0(x)                                    (((x) >> 0) & 0x07)
@@ -7770,6 +9714,21 @@
 #define   S_028B38_MAX_VERT_OUT(x)                                    (((x) & 0x7FF) << 0)
 #define   G_028B38_MAX_VERT_OUT(x)                                    (((x) >> 0) & 0x7FF)
 #define   C_028B38_MAX_VERT_OUT                                       0xFFFFF800
+/* VI */
+#define R_028B50_VGT_TESS_DISTRIBUTION                                  0x028B50
+#define   S_028B50_ACCUM_ISOLINE(x)                                   (((x) & 0xFF) << 0)
+#define   G_028B50_ACCUM_ISOLINE(x)                                   (((x) >> 0) & 0xFF)
+#define   C_028B50_ACCUM_ISOLINE                                      0xFFFFFF00
+#define   S_028B50_ACCUM_TRI(x)                                       (((x) & 0xFF) << 8)
+#define   G_028B50_ACCUM_TRI(x)                                       (((x) >> 8) & 0xFF)
+#define   C_028B50_ACCUM_TRI                                          0xFFFF00FF
+#define   S_028B50_ACCUM_QUAD(x)                                      (((x) & 0xFF) << 16)
+#define   G_028B50_ACCUM_QUAD(x)                                      (((x) >> 16) & 0xFF)
+#define   C_028B50_ACCUM_QUAD                                         0xFF00FFFF
+#define   S_028B50_DONUT_SPLIT(x)                                     (((x) & 0xFF) << 24)
+#define   G_028B50_DONUT_SPLIT(x)                                     (((x) >> 24) & 0xFF)
+#define   C_028B50_DONUT_SPLIT                                        0x00FFFFFF
+/*    */
 #define R_028B54_VGT_SHADER_STAGES_EN                                   0x028B54
 #define   S_028B54_LS_EN(x)                                           (((x) & 0x03) << 0)
 #define   G_028B54_LS_EN(x)                                           (((x) >> 0) & 0x03)
@@ -7798,6 +9757,20 @@
 #define   S_028B54_DYNAMIC_HS(x)                                      (((x) & 0x1) << 8)
 #define   G_028B54_DYNAMIC_HS(x)                                      (((x) >> 8) & 0x1)
 #define   C_028B54_DYNAMIC_HS                                         0xFFFFFEFF
+/* VI */
+#define   S_028B54_DISPATCH_DRAW_EN(x)                                (((x) & 0x1) << 9)
+#define   G_028B54_DISPATCH_DRAW_EN(x)                                (((x) >> 9) & 0x1)
+#define   C_028B54_DISPATCH_DRAW_EN                                   0xFFFFFDFF
+#define   S_028B54_DIS_DEALLOC_ACCUM_0(x)                             (((x) & 0x1) << 10)
+#define   G_028B54_DIS_DEALLOC_ACCUM_0(x)                             (((x) >> 10) & 0x1)
+#define   C_028B54_DIS_DEALLOC_ACCUM_0                                0xFFFFFBFF
+#define   S_028B54_DIS_DEALLOC_ACCUM_1(x)                             (((x) & 0x1) << 11)
+#define   G_028B54_DIS_DEALLOC_ACCUM_1(x)                             (((x) >> 11) & 0x1)
+#define   C_028B54_DIS_DEALLOC_ACCUM_1                                0xFFFFF7FF
+#define   S_028B54_VS_WAVE_ID_EN(x)                                   (((x) & 0x1) << 12)
+#define   G_028B54_VS_WAVE_ID_EN(x)                                   (((x) >> 12) & 0x1)
+#define   C_028B54_VS_WAVE_ID_EN                                      0xFFFFEFFF
+/*    */
 #define R_028B58_VGT_LS_HS_CONFIG                                       0x028B58
 #define   S_028B58_NUM_PATCHES(x)                                     (((x) & 0xFF) << 0)
 #define   G_028B58_NUM_PATCHES(x)                                     (((x) >> 0) & 0xFF)
@@ -7848,6 +9821,9 @@
 #define   S_028B6C_RESERVED_REDUC_AXIS(x)                             (((x) & 0x1) << 8) /* not on CIK */
 #define   G_028B6C_RESERVED_REDUC_AXIS(x)                             (((x) >> 8) & 0x1) /* not on CIK */
 #define   C_028B6C_RESERVED_REDUC_AXIS                                0xFFFFFEFF /* not on CIK */
+#define   S_028B6C_DEPRECATED(x)                                      (((x) & 0x1) << 9)
+#define   G_028B6C_DEPRECATED(x)                                      (((x) >> 9) & 0x1)
+#define   C_028B6C_DEPRECATED                                         0xFFFFFDFF
 #define   S_028B6C_NUM_DS_WAVES_PER_SIMD(x)                           (((x) & 0x0F) << 10)
 #define   G_028B6C_NUM_DS_WAVES_PER_SIMD(x)                           (((x) >> 10) & 0x0F)
 #define   C_028B6C_NUM_DS_WAVES_PER_SIMD                              0xFFFFC3FF
@@ -7862,6 +9838,14 @@
 #define     V_028B6C_VGT_POLICY_STREAM                              0x01
 #define     V_028B6C_VGT_POLICY_BYPASS                              0x02
 /*     */
+/* VI */
+#define   S_028B6C_DISTRIBUTION_MODE(x)                               (((x) & 0x03) << 17)
+#define   G_028B6C_DISTRIBUTION_MODE(x)                               (((x) >> 17) & 0x03)
+#define   C_028B6C_DISTRIBUTION_MODE                                  0xFFF9FFFF
+#define   S_028B6C_MTYPE(x)                                           (((x) & 0x03) << 19)
+#define   G_028B6C_MTYPE(x)                                           (((x) >> 19) & 0x03)
+#define   C_028B6C_MTYPE                                              0xFFE7FFFF
+/*    */
 #define R_028B70_DB_ALPHA_TO_MASK                                       0x028B70
 #define   S_028B70_ALPHA_TO_MASK_ENABLE(x)                            (((x) & 0x1) << 0)
 #define   G_028B70_ALPHA_TO_MASK_ENABLE(x)                            (((x) >> 0) & 0x1)
@@ -8001,6 +9985,22 @@
 #define   S_028BDC_DX10_DIAMOND_TEST_ENA(x)                           (((x) & 0x1) << 12)
 #define   G_028BDC_DX10_DIAMOND_TEST_ENA(x)                           (((x) >> 12) & 0x1)
 #define   C_028BDC_DX10_DIAMOND_TEST_ENA                              0xFFFFEFFF
+#define R_028BE0_PA_SC_AA_CONFIG                                        0x028BE0 /* fields below use bits 2:0, 4, 16:13, 22:20, 25:24 */
+#define   S_028BE0_MSAA_NUM_SAMPLES(x)                                (((x) & 0x7) << 0)
+#define   G_028BE0_MSAA_NUM_SAMPLES(x)                                (((x) >> 0) & 0x07)
+#define   C_028BE0_MSAA_NUM_SAMPLES                                   0xFFFFFFF8
+#define   S_028BE0_AA_MASK_CENTROID_DTMN(x)                           (((x) & 0x1) << 4)
+#define   G_028BE0_AA_MASK_CENTROID_DTMN(x)                           (((x) >> 4) & 0x1)
+#define   C_028BE0_AA_MASK_CENTROID_DTMN                              0xFFFFFFEF
+#define   S_028BE0_MAX_SAMPLE_DIST(x)                                 (((x) & 0xf) << 13)
+#define   G_028BE0_MAX_SAMPLE_DIST(x)                                 (((x) >> 13) & 0x0F)
+#define   C_028BE0_MAX_SAMPLE_DIST                                    0xFFFE1FFF
+#define   S_028BE0_MSAA_EXPOSED_SAMPLES(x)                            (((x) & 0x7) << 20)
+#define   G_028BE0_MSAA_EXPOSED_SAMPLES(x)                            (((x) >> 20) & 0x07)
+#define   C_028BE0_MSAA_EXPOSED_SAMPLES                               0xFF8FFFFF
+#define   S_028BE0_DETAIL_TO_EXPOSED_MODE(x)                          (((x) & 0x3) << 24)
+#define   G_028BE0_DETAIL_TO_EXPOSED_MODE(x)                          (((x) >> 24) & 0x03)
+#define   C_028BE0_DETAIL_TO_EXPOSED_MODE                             0xFCFFFFFF
 #define R_028BE4_PA_SU_VTX_CNTL                                         0x028BE4
 #define   S_028BE4_PIX_CENTER(x)                                      (((x) & 0x1) << 0)
 #define   G_028BE4_PIX_CENTER(x)                                      (((x) >> 0) & 0x1)
@@ -8569,6 +10569,17 @@
 #define   G_028C70_FMASK_COMPRESSION_DISABLE(x)                       (((x) >> 26) & 0x1)
 #define   C_028C70_FMASK_COMPRESSION_DISABLE                          0xFBFFFFFF
 /*     */
+/* VI */
+#define   S_028C70_FMASK_COMPRESS_1FRAG_ONLY(x)                       (((x) & 0x1) << 27)
+#define   G_028C70_FMASK_COMPRESS_1FRAG_ONLY(x)                       (((x) >> 27) & 0x1)
+#define   C_028C70_FMASK_COMPRESS_1FRAG_ONLY                          0xF7FFFFFF
+#define   S_028C70_DCC_ENABLE(x)                                      (((x) & 0x1) << 28)
+#define   G_028C70_DCC_ENABLE(x)                                      (((x) >> 28) & 0x1)
+#define   C_028C70_DCC_ENABLE                                         0xEFFFFFFF
+#define   S_028C70_CMASK_ADDR_TYPE(x)                                 (((x) & 0x03) << 29)
+#define   G_028C70_CMASK_ADDR_TYPE(x)                                 (((x) >> 29) & 0x03)
+#define   C_028C70_CMASK_ADDR_TYPE                                    0x9FFFFFFF
+/*    */
 #define R_028C74_CB_COLOR0_ATTRIB                                       0x028C74
 #define   S_028C74_TILE_MODE_INDEX(x)                                 (((x) & 0x1F) << 0)
 #define   G_028C74_TILE_MODE_INDEX(x)                                 (((x) >> 0) & 0x1F)
@@ -8576,7 +10587,9 @@
 #define   S_028C74_FMASK_TILE_MODE_INDEX(x)                           (((x) & 0x1F) << 5)
 #define   G_028C74_FMASK_TILE_MODE_INDEX(x)                           (((x) >> 5) & 0x1F)
 #define   C_028C74_FMASK_TILE_MODE_INDEX                              0xFFFFFC1F
-#define   S_028C74_FMASK_BANK_HEIGHT(x)				      (((x) & 0x3) << 10) /* SI errata */
+#define   S_028C74_FMASK_BANK_HEIGHT(x)                               (((x) & 0x03) << 10)
+#define   G_028C74_FMASK_BANK_HEIGHT(x)                               (((x) >> 10) & 0x03)
+#define   C_028C74_FMASK_BANK_HEIGHT                                  0xFFFFF3FF
 #define   S_028C74_NUM_SAMPLES(x)                                     (((x) & 0x07) << 12)
 #define   G_028C74_NUM_SAMPLES(x)                                     (((x) >> 12) & 0x07)
 #define   C_028C74_NUM_SAMPLES                                        0xFFFF8FFF
@@ -8586,6 +10599,36 @@
 #define   S_028C74_FORCE_DST_ALPHA_1(x)                               (((x) & 0x1) << 17)
 #define   G_028C74_FORCE_DST_ALPHA_1(x)                               (((x) >> 17) & 0x1)
 #define   C_028C74_FORCE_DST_ALPHA_1                                  0xFFFDFFFF
+/* VI */
+#define R_028C78_CB_COLOR0_DCC_CONTROL                                  0x028C78 /* fields below cover bits 17:0 contiguously */
+#define   S_028C78_OVERWRITE_COMBINER_DISABLE(x)                      (((x) & 0x1) << 0)
+#define   G_028C78_OVERWRITE_COMBINER_DISABLE(x)                      (((x) >> 0) & 0x1)
+#define   C_028C78_OVERWRITE_COMBINER_DISABLE                         0xFFFFFFFE
+#define   S_028C78_KEY_CLEAR_ENABLE(x)                                (((x) & 0x1) << 1)
+#define   G_028C78_KEY_CLEAR_ENABLE(x)                                (((x) >> 1) & 0x1)
+#define   C_028C78_KEY_CLEAR_ENABLE                                   0xFFFFFFFD
+#define   S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x)                     (((x) & 0x03) << 2)
+#define   G_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(x)                     (((x) >> 2) & 0x03)
+#define   C_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE                        0xFFFFFFF3
+#define   S_028C78_MIN_COMPRESSED_BLOCK_SIZE(x)                       (((x) & 0x1) << 4)
+#define   G_028C78_MIN_COMPRESSED_BLOCK_SIZE(x)                       (((x) >> 4) & 0x1)
+#define   C_028C78_MIN_COMPRESSED_BLOCK_SIZE                          0xFFFFFFEF
+#define   S_028C78_MAX_COMPRESSED_BLOCK_SIZE(x)                       (((x) & 0x03) << 5)
+#define   G_028C78_MAX_COMPRESSED_BLOCK_SIZE(x)                       (((x) >> 5) & 0x03)
+#define   C_028C78_MAX_COMPRESSED_BLOCK_SIZE                          0xFFFFFF9F
+#define   S_028C78_COLOR_TRANSFORM(x)                                 (((x) & 0x03) << 7)
+#define   G_028C78_COLOR_TRANSFORM(x)                                 (((x) >> 7) & 0x03)
+#define   C_028C78_COLOR_TRANSFORM                                    0xFFFFFE7F
+#define   S_028C78_INDEPENDENT_64B_BLOCKS(x)                          (((x) & 0x1) << 9)
+#define   G_028C78_INDEPENDENT_64B_BLOCKS(x)                          (((x) >> 9) & 0x1)
+#define   C_028C78_INDEPENDENT_64B_BLOCKS                             0xFFFFFDFF
+#define   S_028C78_LOSSY_RGB_PRECISION(x)                             (((x) & 0x0F) << 10)
+#define   G_028C78_LOSSY_RGB_PRECISION(x)                             (((x) >> 10) & 0x0F)
+#define   C_028C78_LOSSY_RGB_PRECISION                                0xFFFFC3FF
+#define   S_028C78_LOSSY_ALPHA_PRECISION(x)                           (((x) & 0x0F) << 14)
+#define   G_028C78_LOSSY_ALPHA_PRECISION(x)                           (((x) >> 14) & 0x0F)
+#define   C_028C78_LOSSY_ALPHA_PRECISION                              0xFFFC3FFF
+/*    */
 #define R_028C7C_CB_COLOR0_CMASK                                        0x028C7C
 #define R_028C80_CB_COLOR0_CMASK_SLICE                                  0x028C80
 #define   S_028C80_TILE_MAX(x)                                        (((x) & 0x3FFF) << 0)
@@ -8598,90 +10641,105 @@
 #define   C_028C88_TILE_MAX                                           0xFFC00000
 #define R_028C8C_CB_COLOR0_CLEAR_WORD0                                  0x028C8C
 #define R_028C90_CB_COLOR0_CLEAR_WORD1                                  0x028C90
+#define R_028C94_CB_COLOR0_DCC_BASE                                     0x028C94 /* VI */
 #define R_028C9C_CB_COLOR1_BASE                                         0x028C9C
 #define R_028CA0_CB_COLOR1_PITCH                                        0x028CA0
 #define R_028CA4_CB_COLOR1_SLICE                                        0x028CA4
 #define R_028CA8_CB_COLOR1_VIEW                                         0x028CA8
 #define R_028CAC_CB_COLOR1_INFO                                         0x028CAC
 #define R_028CB0_CB_COLOR1_ATTRIB                                       0x028CB0
-#define R_028CD4_CB_COLOR1_CMASK                                        0x028CB8
+#define R_028CB4_CB_COLOR1_DCC_CONTROL                                  0x028CB4 /* VI */
+#define R_028CB8_CB_COLOR1_CMASK                                        0x028CB8
 #define R_028CBC_CB_COLOR1_CMASK_SLICE                                  0x028CBC
 #define R_028CC0_CB_COLOR1_FMASK                                        0x028CC0
 #define R_028CC4_CB_COLOR1_FMASK_SLICE                                  0x028CC4
 #define R_028CC8_CB_COLOR1_CLEAR_WORD0                                  0x028CC8
 #define R_028CCC_CB_COLOR1_CLEAR_WORD1                                  0x028CCC
+#define R_028CD0_CB_COLOR1_DCC_BASE                                     0x028CD0 /* VI */
 #define R_028CD8_CB_COLOR2_BASE                                         0x028CD8
 #define R_028CDC_CB_COLOR2_PITCH                                        0x028CDC
 #define R_028CE0_CB_COLOR2_SLICE                                        0x028CE0
 #define R_028CE4_CB_COLOR2_VIEW                                         0x028CE4
 #define R_028CE8_CB_COLOR2_INFO                                         0x028CE8
 #define R_028CEC_CB_COLOR2_ATTRIB                                       0x028CEC
+#define R_028CF0_CB_COLOR2_DCC_CONTROL                                  0x028CF0 /* VI */
 #define R_028CF4_CB_COLOR2_CMASK                                        0x028CF4
 #define R_028CF8_CB_COLOR2_CMASK_SLICE                                  0x028CF8
 #define R_028CFC_CB_COLOR2_FMASK                                        0x028CFC
 #define R_028D00_CB_COLOR2_FMASK_SLICE                                  0x028D00
 #define R_028D04_CB_COLOR2_CLEAR_WORD0                                  0x028D04
 #define R_028D08_CB_COLOR2_CLEAR_WORD1                                  0x028D08
+#define R_028D0C_CB_COLOR2_DCC_BASE                                     0x028D0C /* VI */
 #define R_028D14_CB_COLOR3_BASE                                         0x028D14
 #define R_028D18_CB_COLOR3_PITCH                                        0x028D18
 #define R_028D1C_CB_COLOR3_SLICE                                        0x028D1C
 #define R_028D20_CB_COLOR3_VIEW                                         0x028D20
 #define R_028D24_CB_COLOR3_INFO                                         0x028D24
 #define R_028D28_CB_COLOR3_ATTRIB                                       0x028D28
+#define R_028D2C_CB_COLOR3_DCC_CONTROL                                  0x028D2C /* VI */
 #define R_028D30_CB_COLOR3_CMASK                                        0x028D30
 #define R_028D34_CB_COLOR3_CMASK_SLICE                                  0x028D34
 #define R_028D38_CB_COLOR3_FMASK                                        0x028D38
 #define R_028D3C_CB_COLOR3_FMASK_SLICE                                  0x028D3C
 #define R_028D40_CB_COLOR3_CLEAR_WORD0                                  0x028D40
 #define R_028D44_CB_COLOR3_CLEAR_WORD1                                  0x028D44
+#define R_028D48_CB_COLOR3_DCC_BASE                                     0x028D48 /* VI */
 #define R_028D50_CB_COLOR4_BASE                                         0x028D50
 #define R_028D54_CB_COLOR4_PITCH                                        0x028D54
 #define R_028D58_CB_COLOR4_SLICE                                        0x028D58
 #define R_028D5C_CB_COLOR4_VIEW                                         0x028D5C
 #define R_028D60_CB_COLOR4_INFO                                         0x028D60
 #define R_028D64_CB_COLOR4_ATTRIB                                       0x028D64
+#define R_028D68_CB_COLOR4_DCC_CONTROL                                  0x028D68 /* VI */
 #define R_028D6C_CB_COLOR4_CMASK                                        0x028D6C
 #define R_028D70_CB_COLOR4_CMASK_SLICE                                  0x028D70
 #define R_028D74_CB_COLOR4_FMASK                                        0x028D74
 #define R_028D78_CB_COLOR4_FMASK_SLICE                                  0x028D78
 #define R_028D7C_CB_COLOR4_CLEAR_WORD0                                  0x028D7C
 #define R_028D80_CB_COLOR4_CLEAR_WORD1                                  0x028D80
+#define R_028D84_CB_COLOR4_DCC_BASE                                     0x028D84 /* VI */
 #define R_028D8C_CB_COLOR5_BASE                                         0x028D8C
 #define R_028D90_CB_COLOR5_PITCH                                        0x028D90
 #define R_028D94_CB_COLOR5_SLICE                                        0x028D94
 #define R_028D98_CB_COLOR5_VIEW                                         0x028D98
 #define R_028D9C_CB_COLOR5_INFO                                         0x028D9C
 #define R_028DA0_CB_COLOR5_ATTRIB                                       0x028DA0
+#define R_028DA4_CB_COLOR5_DCC_CONTROL                                  0x028DA4 /* VI */
 #define R_028DA8_CB_COLOR5_CMASK                                        0x028DA8
 #define R_028DAC_CB_COLOR5_CMASK_SLICE                                  0x028DAC
 #define R_028DB0_CB_COLOR5_FMASK                                        0x028DB0
 #define R_028DB4_CB_COLOR5_FMASK_SLICE                                  0x028DB4
 #define R_028DB8_CB_COLOR5_CLEAR_WORD0                                  0x028DB8
 #define R_028DBC_CB_COLOR5_CLEAR_WORD1                                  0x028DBC
+#define R_028DC0_CB_COLOR5_DCC_BASE                                     0x028DC0 /* VI */
 #define R_028DC8_CB_COLOR6_BASE                                         0x028DC8
 #define R_028DCC_CB_COLOR6_PITCH                                        0x028DCC
 #define R_028DD0_CB_COLOR6_SLICE                                        0x028DD0
 #define R_028DD4_CB_COLOR6_VIEW                                         0x028DD4
 #define R_028DD8_CB_COLOR6_INFO                                         0x028DD8
 #define R_028DDC_CB_COLOR6_ATTRIB                                       0x028DDC
+#define R_028DE0_CB_COLOR6_DCC_CONTROL                                  0x028DE0 /* VI */
 #define R_028DE4_CB_COLOR6_CMASK                                        0x028DE4
 #define R_028DE8_CB_COLOR6_CMASK_SLICE                                  0x028DE8
 #define R_028DEC_CB_COLOR6_FMASK                                        0x028DEC
 #define R_028DF0_CB_COLOR6_FMASK_SLICE                                  0x028DF0
 #define R_028DF4_CB_COLOR6_CLEAR_WORD0                                  0x028DF4
 #define R_028DF8_CB_COLOR6_CLEAR_WORD1                                  0x028DF8
+#define R_028DFC_CB_COLOR6_DCC_BASE                                     0x028DFC /* VI */
 #define R_028E04_CB_COLOR7_BASE                                         0x028E04
 #define R_028E08_CB_COLOR7_PITCH                                        0x028E08
 #define R_028E0C_CB_COLOR7_SLICE                                        0x028E0C
 #define R_028E10_CB_COLOR7_VIEW                                         0x028E10
 #define R_028E14_CB_COLOR7_INFO                                         0x028E14
 #define R_028E18_CB_COLOR7_ATTRIB                                       0x028E18
+#define R_028E1C_CB_COLOR7_DCC_CONTROL                                  0x028E1C /* VI */
 #define R_028E20_CB_COLOR7_CMASK                                        0x028E20
 #define R_028E24_CB_COLOR7_CMASK_SLICE                                  0x028E24
 #define R_028E28_CB_COLOR7_FMASK                                        0x028E28
 #define R_028E2C_CB_COLOR7_FMASK_SLICE                                  0x028E2C
 #define R_028E30_CB_COLOR7_CLEAR_WORD0                                  0x028E30
 #define R_028E34_CB_COLOR7_CLEAR_WORD1                                  0x028E34
+#define R_028E38_CB_COLOR7_DCC_BASE                                     0x028E38 /* VI */
 
 /* SI async DMA packets */
 #define SI_DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) |    \
diff --git a/src/gallium/drivers/rbug/rbug_context.h b/src/gallium/drivers/rbug/rbug_context.h
index 5e7b9d4dee4..e99f6edc523 100644
--- a/src/gallium/drivers/rbug/rbug_context.h
+++ b/src/gallium/drivers/rbug/rbug_context.h
@@ -79,7 +79,7 @@ struct rbug_context {
    struct rbug_list shaders;
 };
 
-static INLINE struct rbug_context *
+static inline struct rbug_context *
 rbug_context(struct pipe_context *pipe)
 {
    return (struct rbug_context *)pipe;
diff --git a/src/gallium/drivers/rbug/rbug_objects.h b/src/gallium/drivers/rbug/rbug_objects.h
index 3fba3334228..02973e07996 100644
--- a/src/gallium/drivers/rbug/rbug_objects.h
+++ b/src/gallium/drivers/rbug/rbug_objects.h
@@ -93,7 +93,7 @@ struct rbug_transfer
 };
 
 
-static INLINE struct rbug_resource *
+static inline struct rbug_resource *
 rbug_resource(struct pipe_resource *_resource)
 {
    if (!_resource)
@@ -102,7 +102,7 @@ rbug_resource(struct pipe_resource *_resource)
    return (struct rbug_resource *)_resource;
 }
 
-static INLINE struct rbug_sampler_view *
+static inline struct rbug_sampler_view *
 rbug_sampler_view(struct pipe_sampler_view *_sampler_view)
 {
    if (!_sampler_view)
@@ -111,7 +111,7 @@ rbug_sampler_view(struct pipe_sampler_view *_sampler_view)
    return (struct rbug_sampler_view *)_sampler_view;
 }
 
-static INLINE struct rbug_surface *
+static inline struct rbug_surface *
 rbug_surface(struct pipe_surface *_surface)
 {
    if (!_surface)
@@ -120,7 +120,7 @@ rbug_surface(struct pipe_surface *_surface)
    return (struct rbug_surface *)_surface;
 }
 
-static INLINE struct rbug_transfer *
+static inline struct rbug_transfer *
 rbug_transfer(struct pipe_transfer *_transfer)
 {
    if (!_transfer)
@@ -129,7 +129,7 @@ rbug_transfer(struct pipe_transfer *_transfer)
    return (struct rbug_transfer *)_transfer;
 }
 
-static INLINE struct rbug_shader *
+static inline struct rbug_shader *
 rbug_shader(void *_state)
 {
    if (!_state)
@@ -137,7 +137,7 @@ rbug_shader(void *_state)
    return (struct rbug_shader *)_state;
 }
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 rbug_resource_unwrap(struct pipe_resource *_resource)
 {
    if (!_resource)
@@ -145,7 +145,7 @@ rbug_resource_unwrap(struct pipe_resource *_resource)
    return rbug_resource(_resource)->resource;
 }
 
-static INLINE struct pipe_sampler_view *
+static inline struct pipe_sampler_view *
 rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view)
 {
    if (!_sampler_view)
@@ -153,7 +153,7 @@ rbug_sampler_view_unwrap(struct pipe_sampler_view *_sampler_view)
    return rbug_sampler_view(_sampler_view)->sampler_view;
 }
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 rbug_surface_unwrap(struct pipe_surface *_surface)
 {
    if (!_surface)
@@ -161,7 +161,7 @@ rbug_surface_unwrap(struct pipe_surface *_surface)
    return rbug_surface(_surface)->surface;
 }
 
-static INLINE struct pipe_transfer *
+static inline struct pipe_transfer *
 rbug_transfer_unwrap(struct pipe_transfer *_transfer)
 {
    if (!_transfer)
@@ -169,7 +169,7 @@ rbug_transfer_unwrap(struct pipe_transfer *_transfer)
    return rbug_transfer(_transfer)->transfer;
 }
 
-static INLINE void *
+static inline void *
 rbug_shader_unwrap(void *_state)
 {
    struct rbug_shader *shader;
diff --git a/src/gallium/drivers/rbug/rbug_screen.c b/src/gallium/drivers/rbug/rbug_screen.c
index d5a3164e217..7da4e81560a 100644
--- a/src/gallium/drivers/rbug/rbug_screen.c
+++ b/src/gallium/drivers/rbug/rbug_screen.c
@@ -225,17 +225,6 @@ rbug_screen_fence_reference(struct pipe_screen *_screen,
                            fence);
 }
 
-static boolean
-rbug_screen_fence_signalled(struct pipe_screen *_screen,
-                            struct pipe_fence_handle *fence)
-{
-   struct rbug_screen *rb_screen = rbug_screen(_screen);
-   struct pipe_screen *screen = rb_screen->screen;
-
-   return screen->fence_signalled(screen,
-                                  fence);
-}
-
 static boolean
 rbug_screen_fence_finish(struct pipe_screen *_screen,
                          struct pipe_fence_handle *fence,
@@ -288,7 +277,6 @@ rbug_screen_create(struct pipe_screen *screen)
    rb_screen->base.resource_destroy = rbug_screen_resource_destroy;
    rb_screen->base.flush_frontbuffer = rbug_screen_flush_frontbuffer;
    rb_screen->base.fence_reference = rbug_screen_fence_reference;
-   rb_screen->base.fence_signalled = rbug_screen_fence_signalled;
    rb_screen->base.fence_finish = rbug_screen_fence_finish;
 
    rb_screen->screen = screen;
diff --git a/src/gallium/drivers/rbug/rbug_screen.h b/src/gallium/drivers/rbug/rbug_screen.h
index a53afac05e9..fd92374beda 100644
--- a/src/gallium/drivers/rbug/rbug_screen.h
+++ b/src/gallium/drivers/rbug/rbug_screen.h
@@ -60,7 +60,7 @@ struct rbug_screen
    struct rbug_list transfers;
 };
 
-static INLINE struct rbug_screen *
+static inline struct rbug_screen *
 rbug_screen(struct pipe_screen *screen)
 {
    return (struct rbug_screen *)screen;
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 50a73369c1d..577df814b29 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -203,7 +203,7 @@ struct softpipe_context {
 };
 
 
-static INLINE struct softpipe_context *
+static inline struct softpipe_context *
 softpipe_context( struct pipe_context *pipe )
 {
    return (struct softpipe_context *)pipe;
diff --git a/src/gallium/drivers/softpipe/sp_fence.c b/src/gallium/drivers/softpipe/sp_fence.c
index c2897ed1ef8..6168236ec96 100644
--- a/src/gallium/drivers/softpipe/sp_fence.c
+++ b/src/gallium/drivers/softpipe/sp_fence.c
@@ -40,15 +40,6 @@ softpipe_fence_reference(struct pipe_screen *screen,
 }
 
 
-static boolean
-softpipe_fence_signalled(struct pipe_screen *screen,
-                         struct pipe_fence_handle *fence)
-{
-   assert(fence);
-   return TRUE;
-}
-
-
 static boolean
 softpipe_fence_finish(struct pipe_screen *screen,
                       struct pipe_fence_handle *fence,
@@ -64,5 +55,4 @@ softpipe_init_screen_fence_funcs(struct pipe_screen *screen)
 {
    screen->fence_reference = softpipe_fence_reference;
    screen->fence_finish = softpipe_fence_finish;
-   screen->fence_signalled = softpipe_fence_signalled;
 }
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 369ab6ed8d4..89411777ec9 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -52,7 +52,7 @@ struct sp_exec_fragment_shader
 
 
 /** cast wrapper */
-static INLINE struct sp_exec_fragment_shader *
+static inline struct sp_exec_fragment_shader *
 sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var)
 {
    return (struct sp_exec_fragment_shader *) var;
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 18eca611669..f8a3eacdb37 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -145,7 +145,7 @@ sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 }
 
 
-static INLINE cptrf4 get_vert( const void *vertex_buffer,
+static inline cptrf4 get_vert( const void *vertex_buffer,
                                int index,
                                int stride )
 {
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index a32bd7fd241..5b458450cd8 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -63,7 +63,7 @@ struct blend_quad_stage
 
 
 /** cast wrapper */
-static INLINE struct blend_quad_stage *
+static inline struct blend_quad_stage *
 blend_quad_stage(struct quad_stage *stage)
 {
    return (struct blend_quad_stage *) stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 82c58d04527..395bc70f2cf 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -56,7 +56,7 @@ struct quad_shade_stage
 
 
 /** cast wrapper */
-static INLINE struct quad_shade_stage *
+static inline struct quad_shade_stage *
 quad_shade_stage(struct quad_stage *qs)
 {
    return (struct quad_shade_stage *) qs;
@@ -67,7 +67,7 @@ quad_shade_stage(struct quad_stage *qs)
  * Execute fragment shader for the four fragments in the quad.
  * \return TRUE if quad is alive, FALSE if all four pixels are killed
  */
-static INLINE boolean
+static inline boolean
 shade_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct softpipe_context *softpipe = qs->softpipe;
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index a688d319bb8..0bfd9c3578c 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -234,6 +234,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
       return 1;
    case PIPE_CAP_CLIP_HALFZ:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 1;
    case PIPE_CAP_VERTEXID_NOBASE:
       return 0;
@@ -242,6 +244,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/softpipe/sp_screen.h b/src/gallium/drivers/softpipe/sp_screen.h
index d39e9f48e80..f0e929111c2 100644
--- a/src/gallium/drivers/softpipe/sp_screen.h
+++ b/src/gallium/drivers/softpipe/sp_screen.h
@@ -49,7 +49,7 @@ struct softpipe_screen {
    boolean use_llvm;
 };
 
-static INLINE struct softpipe_screen *
+static inline struct softpipe_screen *
 softpipe_screen( struct pipe_screen *pipe )
 {
    return (struct softpipe_screen *)pipe;
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 6704015112b..ff3cb9fe5e1 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -125,7 +125,7 @@ struct setup_context {
 /**
  * Clip setup->quad against the scissor/surface bounds.
  */
-static INLINE void
+static inline void
 quad_clip(struct setup_context *setup, struct quad_header *quad)
 {
    const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
@@ -156,7 +156,7 @@ quad_clip(struct setup_context *setup, struct quad_header *quad)
 /**
  * Emit a quad (pass to next stage) with clipping.
  */
-static INLINE void
+static inline void
 clip_emit_quad(struct setup_context *setup, struct quad_header *quad)
 {
    quad_clip( setup, quad );
@@ -178,14 +178,14 @@ clip_emit_quad(struct setup_context *setup, struct quad_header *quad)
  * Given an X or Y coordinate, return the block/quad coordinate that it
  * belongs to.
  */
-static INLINE int
+static inline int
 block(int x)
 {
    return x & ~(2-1);
 }
 
 
-static INLINE int
+static inline int
 block_x(int x)
 {
    return x & ~(16-1);
@@ -1039,7 +1039,7 @@ setup_line_coefficients(struct setup_context *setup,
 /**
  * Plot a pixel in a line segment.
  */
-static INLINE void
+static inline void
 plot(struct setup_context *setup, int x, int y)
 {
    const int iy = y & 1;
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 1010b63de2c..565fca632c6 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -58,7 +58,7 @@
  * of improperly weighted linear-filtered textures.
  * The tests/texwrap.c demo is a good test.
  */
-static INLINE float
+static inline float
 frac(float f)
 {
    return f - floorf(f);
@@ -69,7 +69,7 @@ frac(float f)
 /**
  * Linear interpolation macro
  */
-static INLINE float
+static inline float
 lerp(float a, float v0, float v1)
 {
    return v0 + a * (v1 - v0);
@@ -84,7 +84,7 @@ lerp(float a, float v0, float v1)
  * optimization!  If we find that's not true on some systems, convert
  * to a macro.
  */
-static INLINE float
+static inline float
 lerp_2d(float a, float b,
         float v00, float v10, float v01, float v11)
 {
@@ -97,7 +97,7 @@ lerp_2d(float a, float b,
 /**
  * As above, but 3D interpolation of 8 values.
  */
-static INLINE float
+static inline float
 lerp_3d(float a, float b, float c,
         float v000, float v100, float v010, float v110,
         float v001, float v101, float v011, float v111)
@@ -115,7 +115,7 @@ lerp_3d(float a, float b, float c,
  * value.  To avoid that problem we add a large multiple of the size
  * (rather than using a conditional).
  */
-static INLINE int
+static inline int
 repeat(int coord, unsigned size)
 {
    return (coord + size * 1024) % size;
@@ -486,7 +486,7 @@ wrap_linear_unorm_clamp_to_edge(float s, unsigned size, int offset,
 /**
  * Do coordinate to array index conversion.  For array textures.
  */
-static INLINE int
+static inline int
 coord_to_layer(float coord, unsigned first_layer, unsigned last_layer)
 {
    int c = util_ifloor(coord + 0.5F);
@@ -587,7 +587,7 @@ compute_lambda_vert(const struct sp_sampler_view *sview,
 
 
 
-static INLINE const float *
+static inline const float *
 get_texel_2d_no_border(const struct sp_sampler_view *sp_sview,
                        union tex_tile_address addr, int x, int y)
 {
@@ -603,7 +603,7 @@ get_texel_2d_no_border(const struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE const float *
+static inline const float *
 get_texel_2d(const struct sp_sampler_view *sp_sview,
              const struct sp_sampler *sp_samp,
              union tex_tile_address addr, int x, int y)
@@ -695,7 +695,7 @@ static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = {
      PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y }
 };
 
-static INLINE unsigned
+static inline unsigned
 get_next_face(unsigned face, int idx)
 {
    return face_array[face][idx];
@@ -705,7 +705,7 @@ get_next_face(unsigned face, int idx)
  * return a new xcoord based on old face, old coords, cube size
  * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+)
  */
-static INLINE int
+static inline int
 get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
 {
    if ((face == 0 && fall_off_index != 1) ||
@@ -743,7 +743,7 @@ get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
  * return a new ycoord based on old face, old coords, cube size
  * and fall_off_index (0 for x-, 1 for x+, 2 for y-, 3 for y+)
  */
-static INLINE int
+static inline int
 get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
 {
    if ((fall_off_index <= 1) && (face <= 1 || face >= 4)) {
@@ -771,7 +771,7 @@ get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
 
 /* Gather a quad of adjacent texels within a tile:
  */
-static INLINE void
+static inline void
 get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview,
                                         union tex_tile_address addr,
                                         unsigned x, unsigned y,
@@ -795,7 +795,7 @@ get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_view *sp_sview,
 
 /* Gather a quad of potentially non-adjacent texels:
  */
-static INLINE void
+static inline void
 get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview,
                             union tex_tile_address addr,
                             int x0, int y0,
@@ -810,7 +810,7 @@ get_texel_quad_2d_no_border(const struct sp_sampler_view *sp_sview,
 
 /* Can involve a lot of unnecessary checks for border color:
  */
-static INLINE void
+static inline void
 get_texel_quad_2d(const struct sp_sampler_view *sp_sview,
                   const struct sp_sampler *sp_samp,
                   union tex_tile_address addr,
@@ -828,7 +828,7 @@ get_texel_quad_2d(const struct sp_sampler_view *sp_sview,
 
 /* 3d variants:
  */
-static INLINE const float *
+static inline const float *
 get_texel_3d_no_border(const struct sp_sampler_view *sp_sview,
                        union tex_tile_address addr, int x, int y, int z)
 {
@@ -846,7 +846,7 @@ get_texel_3d_no_border(const struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE const float *
+static inline const float *
 get_texel_3d(const struct sp_sampler_view *sp_sview,
              const struct sp_sampler *sp_samp,
              union tex_tile_address addr, int x, int y, int z)
@@ -866,7 +866,7 @@ get_texel_3d(const struct sp_sampler_view *sp_sview,
 
 
 /* Get texel pointer for 1D array texture */
-static INLINE const float *
+static inline const float *
 get_texel_1d_array(const struct sp_sampler_view *sp_sview,
                    const struct sp_sampler *sp_samp,
                    union tex_tile_address addr, int x, int y)
@@ -884,7 +884,7 @@ get_texel_1d_array(const struct sp_sampler_view *sp_sview,
 
 
 /* Get texel pointer for 2D array texture */
-static INLINE const float *
+static inline const float *
 get_texel_2d_array(const struct sp_sampler_view *sp_sview,
                    const struct sp_sampler *sp_samp,
                    union tex_tile_address addr, int x, int y, int layer)
@@ -905,7 +905,7 @@ get_texel_2d_array(const struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE const float *
+static inline const float *
 get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
                         union tex_tile_address addr, int x, int y,
                         float *corner, int layer, unsigned face)
@@ -960,7 +960,7 @@ get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
 
 
 /* Get texel pointer for cube array texture */
-static INLINE const float *
+static inline const float *
 get_texel_cube_array(const struct sp_sampler_view *sp_sview,
                      const struct sp_sampler *sp_samp,
                      union tex_tile_address addr, int x, int y, int layer)
@@ -986,7 +986,7 @@ get_texel_cube_array(const struct sp_sampler_view *sp_sview,
  * If level = 2, then we'll return 64 (the width at level=2).
  * Return 1 if level > base_pot.
  */
-static INLINE unsigned
+static inline unsigned
 pot_level_size(unsigned base_pot, unsigned level)
 {
    return (base_pot >= level) ? (1 << (base_pot - level)) : 1;
@@ -1016,7 +1016,7 @@ print_sample_4(const char *function, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZ
 
 /* Some image-filter fastpaths:
  */
-static INLINE void
+static inline void
 img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
                                 const struct img_filter_args *args,
@@ -1070,7 +1070,7 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE void
+static inline void
 img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
                                  struct sp_sampler *sp_samp,
                                  const struct img_filter_args *args,
@@ -1104,7 +1104,7 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE void
+static inline void
 img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
                                 const struct img_filter_args *args,
@@ -1819,7 +1819,7 @@ img_filter_3d_linear(struct sp_sampler_view *sp_sview,
  * \param lod_in per-fragment lod_bias or explicit_lod.
  * \param lod returns the per-fragment lod.
  */
-static INLINE void
+static inline void
 compute_lod(const struct pipe_sampler_state *sampler,
             enum tgsi_sampler_control control,
             const float biased_lambda,
@@ -1859,7 +1859,7 @@ compute_lod(const struct pipe_sampler_state *sampler,
  * \param lod_in per-fragment lod_bias or explicit_lod.
  * \param lod results per-fragment lod.
  */
-static INLINE void
+static inline void
 compute_lambda_lod(struct sp_sampler_view *sp_sview,
                    struct sp_sampler *sp_samp,
                    const float s[TGSI_QUAD_SIZE],
@@ -1906,7 +1906,7 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview,
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 get_gather_component(const float lod_in[TGSI_QUAD_SIZE])
 {
    /* gather component is stored in lod_in slot as unsigned */
@@ -2789,7 +2789,7 @@ get_linear_wrap(unsigned mode)
 /**
  * Is swizzling needed for the given state key?
  */
-static INLINE bool
+static inline bool
 any_swizzle(const struct pipe_sampler_view *view)
 {
    return (view->swizzle_r != PIPE_SWIZZLE_RED ||
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
index 4a421a8f882..21f38b2f859 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -185,7 +185,7 @@ sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
  * This is basically a direct-map cache.
  * XXX There's probably lots of ways in which we can improve this.
  */
-static INLINE uint
+static inline uint
 tex_cache_pos( union tex_tile_address addr )
 {
    uint entry = (addr.bits.x + 
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
index 2233effc439..b7ad222d715 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -127,7 +127,7 @@ extern const struct softpipe_tex_cached_tile *
 sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
                         union tex_tile_address addr );
 
-static INLINE union tex_tile_address
+static inline union tex_tile_address
 tex_tile_address( unsigned x,
                   unsigned y,
                   unsigned z,
@@ -147,7 +147,7 @@ tex_tile_address( unsigned x,
 
 /* Quickly retrieve tile if it matches last lookup.
  */
-static INLINE const struct softpipe_tex_cached_tile *
+static inline const struct softpipe_tex_cached_tile *
 sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
                        union tex_tile_address addr )
 {
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index 1701bf574d9..fbf741a9c72 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -81,13 +81,13 @@ struct softpipe_transfer
 
 
 /** cast wrappers */
-static INLINE struct softpipe_resource *
+static inline struct softpipe_resource *
 softpipe_resource(struct pipe_resource *pt)
 {
    return (struct softpipe_resource *) pt;
 }
 
-static INLINE struct softpipe_transfer *
+static inline struct softpipe_transfer *
 softpipe_transfer(struct pipe_transfer *pt)
 {
    return (struct softpipe_transfer *) pt;
@@ -99,7 +99,7 @@ softpipe_transfer(struct pipe_transfer *pt)
  * This is a short-cut instead of using map()/unmap(), which should
  * probably be fixed.
  */
-static INLINE void *
+static inline void *
 softpipe_resource_data(struct pipe_resource *pt)
 {
    if (!pt)
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index b763f526e61..9cc8ac12525 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -52,7 +52,7 @@ sp_alloc_tile(struct softpipe_tile_cache *tc);
    (((x) + (y) * 5 + (l) * 10) % NUM_ENTRIES)
 
 
-static INLINE int addr_to_clear_pos(union tile_address addr)
+static inline int addr_to_clear_pos(union tile_address addr)
 {
    int pos;
    pos = addr.bits.layer * (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE);
@@ -63,7 +63,7 @@ static INLINE int addr_to_clear_pos(union tile_address addr)
 /**
  * Is the tile at (x,y) in cleared state?
  */
-static INLINE uint
+static inline uint
 is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max)
 {
    int pos, bit;
@@ -77,7 +77,7 @@ is_clear_flag_set(const uint *bitvec, union tile_address addr, unsigned max)
 /**
  * Mark the tile at (x,y) as not cleared.
  */
-static INLINE void
+static inline void
 clear_clear_flag(uint *bitvec, union tile_address addr, unsigned max)
 {
    int pos;
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index 167e1ffcada..2c0bafad651 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -128,7 +128,7 @@ sp_find_cached_tile(struct softpipe_tile_cache *tc,
                     union tile_address addr );
 
 
-static INLINE union tile_address
+static inline union tile_address
 tile_address( unsigned x,
               unsigned y, unsigned layer )
 {
@@ -143,7 +143,7 @@ tile_address( unsigned x,
 
 /* Quickly retrieve tile if it matches last lookup.
  */
-static INLINE struct softpipe_cached_tile *
+static inline struct softpipe_cached_tile *
 sp_get_cached_tile(struct softpipe_tile_cache *tc, 
                    int x, int y, int layer )
 {
diff --git a/src/gallium/drivers/svga/Makefile.am b/src/gallium/drivers/svga/Makefile.am
index e0a8cad7208..d46de95e4b4 100644
--- a/src/gallium/drivers/svga/Makefile.am
+++ b/src/gallium/drivers/svga/Makefile.am
@@ -20,8 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/drivers/svga/SConscript b/src/gallium/drivers/svga/SConscript
index bb4d034f1eb..0ee624616f9 100644
--- a/src/gallium/drivers/svga/SConscript
+++ b/src/gallium/drivers/svga/SConscript
@@ -11,7 +11,6 @@ if env['suncc']:
 if env['gcc'] or env['clang']:
 	env.Append(CPPDEFINES = [
 		'HAVE_STDINT_H', 
-		'HAVE_SYS_TYPES_H',
 	])
 	
 env.Prepend(CPPPATH = [
diff --git a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
index 355edfdb702..5e00906ce36 100644
--- a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
+++ b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
@@ -507,7 +507,7 @@ static const uint32 SVGA3D_OUTPUT_REG_DEPTH_NUM_PS20 = 1;
  *----------------------------------------------------------------------
  */
 
-static INLINE SVGA3dShaderRegType
+static inline SVGA3dShaderRegType
 SVGA3dShaderGetRegType(uint32 token)
 {
    SVGA3dShaderSrcToken src;
diff --git a/src/gallium/drivers/svga/include/svga_overlay.h b/src/gallium/drivers/svga/include/svga_overlay.h
index 0f242dd402c..ccbf7912e6d 100644
--- a/src/gallium/drivers/svga/include/svga_overlay.h
+++ b/src/gallium/drivers/svga/include/svga_overlay.h
@@ -133,7 +133,7 @@ struct {
  *----------------------------------------------------------------------
  */
 
-static INLINE Bool
+static inline Bool
 VMwareVideoGetAttributes(const SVGAOverlayFormat format,    // IN
                          uint32 *width,                     // IN / OUT
                          uint32 *height,                    // IN / OUT
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 474b75c3c86..b271832171d 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -57,7 +57,7 @@
  *----------------------------------------------------------------------
  */
 
-static INLINE void
+static inline void
 surface_to_surfaceid(struct svga_winsys_context *swc, // IN
                      struct pipe_surface *surface,    // IN
                      SVGA3dSurfaceImageId *id,        // OUT
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 630f5f77d66..71f038df8c1 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -485,20 +485,20 @@ svga_context_create(struct pipe_screen *screen,
  * Inline conversion functions.  These are better-typed than the
  * macros used previously:
  */
-static INLINE struct svga_context *
+static inline struct svga_context *
 svga_context( struct pipe_context *pipe )
 {
    return (struct svga_context *)pipe;
 }
 
 
-static INLINE boolean
+static inline boolean
 svga_have_gb_objects(const struct svga_context *svga)
 {
    return svga_screen(svga->pipe.screen)->sws->have_gb_objects;
 }
 
-static INLINE boolean
+static inline boolean
 svga_have_gb_dma(const struct svga_context *svga)
 {
    return svga_screen(svga->pipe.screen)->sws->have_gb_dma;
diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h
index 3a3fcd8fae2..82c9b602d5d 100644
--- a/src/gallium/drivers/svga/svga_debug.h
+++ b/src/gallium/drivers/svga/svga_debug.h
@@ -53,7 +53,7 @@ extern int SVGA_DEBUG;
 #define DBSTR(x) ""
 #endif
 
-static INLINE void
+static inline void
 SVGA_DBG( unsigned flag, const char *fmt, ... )
 {
 #ifdef DEBUG 
diff --git a/src/gallium/drivers/svga/svga_draw_private.h b/src/gallium/drivers/svga/svga_draw_private.h
index 1b054038e9f..9ab87e8259a 100644
--- a/src/gallium/drivers/svga/svga_draw_private.h
+++ b/src/gallium/drivers/svga/svga_draw_private.h
@@ -57,7 +57,7 @@ static const unsigned svga_hw_prims =
  * PIPE_PRIM_QUADS, PIPE_PRIM_QUAD_STRIP or PIPE_PRIM_POLYGON.  We convert
  * those to other types of primitives with index/translation code.
  */
-static INLINE unsigned
+static inline unsigned
 svga_translate_prim(unsigned mode, unsigned vcount,unsigned *prim_count)
 {
    switch (mode) {
diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
index 594eec7166e..2890516c0cf 100644
--- a/src/gallium/drivers/svga/svga_pipe_blend.c
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -33,7 +33,7 @@
 #include "svga_hw_reg.h"
 
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_blend_factor(unsigned factor)
 {
    switch (factor) {
@@ -58,7 +58,7 @@ svga_translate_blend_factor(unsigned factor)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_blend_func(unsigned mode)
 {
    switch (mode) {
diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
index cb07dbe09a3..8db21fd7476 100644
--- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -32,7 +32,7 @@
 #include "svga_hw_reg.h"
 
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_compare_func(unsigned func)
 {
    switch (func) {
@@ -50,7 +50,7 @@ svga_translate_compare_func(unsigned func)
    }
 }
 
-static INLINE unsigned
+static inline unsigned
 svga_translate_stencil_op(unsigned op)
 {
    switch (op) {
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index a97a9c46cf8..208a2cd14bf 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -59,7 +59,7 @@ struct svga_query {
 
 
 /** cast wrapper */
-static INLINE struct svga_query *
+static inline struct svga_query *
 svga_query( struct pipe_query *q )
 {
    return (struct svga_query *)q;
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 8a87bb467aa..effd490dd22 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -35,7 +35,7 @@
 
 #include "svga_debug.h"
 
-static INLINE unsigned
+static inline unsigned
 translate_wrap_mode(unsigned wrap)
 {
    switch (wrap) {
@@ -68,7 +68,7 @@ translate_wrap_mode(unsigned wrap)
    }
 }
 
-static INLINE unsigned translate_img_filter( unsigned filter )
+static inline unsigned translate_img_filter( unsigned filter )
 {
    switch (filter) {
    case PIPE_TEX_FILTER_NEAREST: return SVGA3D_TEX_FILTER_NEAREST;
@@ -79,7 +79,7 @@ static INLINE unsigned translate_img_filter( unsigned filter )
    }
 }
 
-static INLINE unsigned translate_mip_filter( unsigned filter )
+static inline unsigned translate_mip_filter( unsigned filter )
 {
    switch (filter) {
    case PIPE_TEX_MIPFILTER_NONE:    return SVGA3D_TEX_FILTER_NONE;
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index d2c7762e7ff..13f85cddbd5 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -45,7 +45,7 @@
  * Vertex and index buffers need hardware backing.  Constant buffers
  * do not.  No other types of buffers currently supported.
  */
-static INLINE boolean
+static inline boolean
 svga_buffer_needs_hw_storage(unsigned usage)
 {
    return usage & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER);
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h
index 83b3d342aec..e838beb6661 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.h
+++ b/src/gallium/drivers/svga/svga_resource_buffer.h
@@ -190,7 +190,7 @@ struct svga_buffer
 };
 
 
-static INLINE struct svga_buffer *
+static inline struct svga_buffer *
 svga_buffer(struct pipe_resource *buffer)
 {
    if (buffer) {
@@ -205,7 +205,7 @@ svga_buffer(struct pipe_resource *buffer)
  * Returns TRUE for user buffers.  We may
  * decide to use an alternate upload path for these buffers.
  */
-static INLINE boolean 
+static inline boolean 
 svga_buffer_is_user_buffer( struct pipe_resource *buffer )
 {
    if (buffer) {
@@ -219,7 +219,7 @@ svga_buffer_is_user_buffer( struct pipe_resource *buffer )
  * Returns a pointer to a struct svga_winsys_screen given a
  * struct svga_buffer.
  */
-static INLINE struct svga_winsys_screen *
+static inline struct svga_winsys_screen *
 svga_buffer_winsys_screen(struct svga_buffer *sbuf)
 {
    return svga_screen(sbuf->b.b.screen)->sws;
@@ -230,7 +230,7 @@ svga_buffer_winsys_screen(struct svga_buffer *sbuf)
  * Returns whether a buffer has hardware storage that is
  * visible to the GPU.
  */
-static INLINE boolean
+static inline boolean
 svga_buffer_has_hw_storage(struct svga_buffer *sbuf)
 {
    if (svga_buffer_winsys_screen(sbuf)->have_gb_objects)
@@ -242,7 +242,7 @@ svga_buffer_has_hw_storage(struct svga_buffer *sbuf)
 /**
  * Map the hardware storage of a buffer.
  */
-static INLINE void *
+static inline void *
 svga_buffer_hw_storage_map(struct svga_context *svga,
                            struct svga_buffer *sbuf,
                            unsigned flags, boolean *retry)
@@ -259,7 +259,7 @@ svga_buffer_hw_storage_map(struct svga_context *svga,
 /**
  * Unmap the hardware storage of a buffer.
  */
-static INLINE void
+static inline void
 svga_buffer_hw_storage_unmap(struct svga_context *svga,
                              struct svga_buffer *sbuf)
 {
diff --git a/src/gallium/drivers/svga/svga_resource_texture.h b/src/gallium/drivers/svga/svga_resource_texture.h
index 1ff42fabab9..19dadfb8828 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.h
+++ b/src/gallium/drivers/svga/svga_resource_texture.h
@@ -106,7 +106,7 @@ struct svga_transfer
 };
 
 
-static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource )
+static inline struct svga_texture *svga_texture( struct pipe_resource *resource )
 {
    struct svga_texture *tex = (struct svga_texture *)resource;
    assert(tex == NULL || tex->b.vtbl == &svga_texture_vtbl);
@@ -114,7 +114,7 @@ static INLINE struct svga_texture *svga_texture( struct pipe_resource *resource
 }
 
 
-static INLINE struct svga_transfer *
+static inline struct svga_transfer *
 svga_transfer(struct pipe_transfer *transfer)
 {
    assert(transfer);
@@ -127,7 +127,7 @@ svga_transfer(struct pipe_transfer *transfer)
  * This is used to track updates to textures when we draw into
  * them via a surface.
  */
-static INLINE void
+static inline void
 svga_age_texture_view(struct svga_texture *tex, unsigned level)
 {
    assert(level < Elements(tex->view_age));
@@ -138,7 +138,7 @@ svga_age_texture_view(struct svga_texture *tex, unsigned level)
 /**
  * Mark the given texture face/level as being defined.
  */
-static INLINE void
+static inline void
 svga_define_texture_level(struct svga_texture *tex,
                           unsigned face,unsigned level)
 {
@@ -148,7 +148,7 @@ svga_define_texture_level(struct svga_texture *tex,
 }
 
 
-static INLINE bool
+static inline bool
 svga_is_texture_level_defined(const struct svga_texture *tex,
                               unsigned face, unsigned level)
 {
@@ -177,7 +177,7 @@ check_face_level(const struct svga_texture *tex,
 }
 
 
-static INLINE void
+static inline void
 svga_set_texture_rendered_to(struct svga_texture *tex,
                              unsigned face, unsigned level)
 {
@@ -186,7 +186,7 @@ svga_set_texture_rendered_to(struct svga_texture *tex,
 }
 
 
-static INLINE void
+static inline void
 svga_clear_texture_rendered_to(struct svga_texture *tex,
                                unsigned face, unsigned level)
 {
@@ -195,7 +195,7 @@ svga_clear_texture_rendered_to(struct svga_texture *tex,
 }
 
 
-static INLINE boolean
+static inline boolean
 svga_was_texture_rendered_to(const struct svga_texture *tex,
                              unsigned face, unsigned level)
 {
diff --git a/src/gallium/drivers/svga/svga_sampler_view.h b/src/gallium/drivers/svga/svga_sampler_view.h
index 2087c1be85e..7f14323f84f 100644
--- a/src/gallium/drivers/svga/svga_sampler_view.h
+++ b/src/gallium/drivers/svga/svga_sampler_view.h
@@ -86,7 +86,7 @@ svga_destroy_sampler_view_priv(struct svga_sampler_view *v);
 void
 svga_debug_describe_sampler_view(char *buf, const struct svga_sampler_view *sv);
 
-static INLINE void
+static inline void
 svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_view *v)
 {
    struct svga_sampler_view *old = *ptr;
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 56e486786df..66c3deaa9e7 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -309,6 +309,10 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_UMA:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
       return 0;
    }
 
@@ -443,7 +447,9 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
       return 0;
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_COMPUTE:
-      /* no support for geometry or compute shaders at this time */
+   case PIPE_SHADER_TESS_CTRL:
+   case PIPE_SHADER_TESS_EVAL:
+      /* no support for geometry, tess or compute shaders at this time */
       return 0;
    default:
       debug_printf("Unexpected shader type (%u) query\n", shader);
@@ -542,15 +548,6 @@ svga_fence_reference(struct pipe_screen *screen,
 }
 
 
-static boolean
-svga_fence_signalled(struct pipe_screen *screen,
-                     struct pipe_fence_handle *fence)
-{
-   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
-   return sws->fence_signalled(sws, fence, 0) == 0;
-}
-
-
 static boolean
 svga_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
@@ -558,6 +555,9 @@ svga_fence_finish(struct pipe_screen *screen,
 {
    struct svga_winsys_screen *sws = svga_screen(screen)->sws;
 
+   if (!timeout)
+      return sws->fence_signalled(sws, fence, 0) == 0;
+
    SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n",
             __FUNCTION__, fence);
 
@@ -645,7 +645,6 @@ svga_screen_create(struct svga_winsys_screen *sws)
    screen->is_format_supported = svga_is_format_supported;
    screen->context_create = svga_context_create;
    screen->fence_reference = svga_fence_reference;
-   screen->fence_signalled = svga_fence_signalled;
    screen->fence_finish = svga_fence_finish;
    screen->get_driver_query_info = svga_get_driver_query_info;
    svgascreen->sws = sws;
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
index b85191c4b26..ea1e743dfe5 100644
--- a/src/gallium/drivers/svga/svga_screen.h
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -82,7 +82,7 @@ struct svga_screen
 
 #ifndef DEBUG
 /** cast wrapper */
-static INLINE struct svga_screen *
+static inline struct svga_screen *
 svga_screen(struct pipe_screen *pscreen)
 {
    return (struct svga_screen *) pscreen;
diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c
index f63f7836187..3c765394a88 100644
--- a/src/gallium/drivers/svga/svga_screen_cache.c
+++ b/src/gallium/drivers/svga/svga_screen_cache.c
@@ -76,7 +76,7 @@ surface_size(const struct svga_host_surface_cache_key *key)
 /**
  * Compute the bucket for this key.
  */
-static INLINE unsigned
+static inline unsigned
 svga_screen_cache_bucket(const struct svga_host_surface_cache_key *key)
 {
    return util_hash_crc32(key, sizeof *key) % SVGA_HOST_SURFACE_CACHE_BUCKETS;
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index fd500ae4401..5102159b96a 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -44,7 +44,7 @@ svga_destroy_shader_variant(struct svga_context *svga,
 /**
  * Check if a shader's bytecode exceeds the device limits.
  */
-static INLINE boolean
+static inline boolean
 svga_shader_too_large(const struct svga_context *svga,
                       const struct svga_shader_variant *variant)
 {
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index 566a79407e5..8cdce742b3b 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -41,7 +41,7 @@
 
 
 
-static INLINE int
+static inline int
 compare_fs_keys(const struct svga_fs_compile_key *a,
                 const struct svga_fs_compile_key *b)
 {
diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c
index fb56b3d36ba..ebb98373e2b 100644
--- a/src/gallium/drivers/svga/svga_state_rss.c
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -61,7 +61,7 @@ do {                                                            \
 } while (0)
 
 
-static INLINE void
+static inline void
 svga_queue_rs( struct rs_queue *q,
                unsigned rss,
                unsigned value )
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
index 0ab571c0588..41334bd7cb9 100644
--- a/src/gallium/drivers/svga/svga_state_tss.c
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -274,7 +274,7 @@ do {                                                                    \
 } while (0)
 
 
-static INLINE void 
+static inline void 
 svga_queue_tss( struct ts_queue *q,
                 unsigned unit,
                 unsigned tss,
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 545c9d7420f..c2a0f1ee6b1 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -41,7 +41,7 @@
 #include "svga_hw_reg.h"
 
 
-static INLINE int
+static inline int
 compare_vs_keys(const struct svga_vs_compile_key *a,
                 const struct svga_vs_compile_key *b)
 {
diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h
index 7b8f6f018d2..2fa72a1c8f0 100644
--- a/src/gallium/drivers/svga/svga_surface.h
+++ b/src/gallium/drivers/svga/svga_surface.h
@@ -84,7 +84,7 @@ svga_texture_copy_handle(struct svga_context *svga,
                          unsigned width, unsigned height, unsigned depth);
 
 
-static INLINE struct svga_surface *
+static inline struct svga_surface *
 svga_surface(struct pipe_surface *surface)
 {
    assert(surface);
@@ -92,7 +92,7 @@ svga_surface(struct pipe_surface *surface)
 }
 
 
-static INLINE const struct svga_surface *
+static inline const struct svga_surface *
 svga_surface_const(const struct pipe_surface *surface)
 {
    assert(surface);
diff --git a/src/gallium/drivers/svga/svga_swtnl_private.h b/src/gallium/drivers/svga/svga_swtnl_private.h
index 608950d7af6..e2106e1e8e6 100644
--- a/src/gallium/drivers/svga/svga_swtnl_private.h
+++ b/src/gallium/drivers/svga/svga_swtnl_private.h
@@ -76,7 +76,7 @@ struct svga_vbuf_render {
 /**
  * Basically a cast wrapper.
  */
-static INLINE struct svga_vbuf_render *
+static inline struct svga_vbuf_render *
 svga_vbuf_render( struct vbuf_render *render )
 {
    assert(render);
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index 9aafd851264..2e2ff5e4673 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -84,7 +84,7 @@ svga_shader_expand(struct svga_shader_emitter *emit)
 }
 
 
-static INLINE boolean
+static inline boolean
 reserve(struct svga_shader_emitter *emit, unsigned nr_dwords)
 {
    if (emit->ptr - emit->buf + nr_dwords * sizeof(unsigned) >= emit->size) {
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index e7a2a134ca5..5c47a4ad39f 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -124,7 +124,7 @@ struct svga_shader_variant
  * The real use of this information is matching vertex elements to
  * fragment shader inputs in the case where vertex shader is disabled.
  */
-static INLINE void svga_generate_vdecl_semantics( unsigned idx,
+static inline void svga_generate_vdecl_semantics( unsigned idx,
                                                   unsigned *usage,
                                                   unsigned *usage_index )
 {
@@ -140,12 +140,12 @@ static INLINE void svga_generate_vdecl_semantics( unsigned idx,
 
 
 
-static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
+static inline unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
 {
    return sizeof *key;
 }
 
-static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
+static inline unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
 {
    return (const char *)&key->tex[key->num_textures] - (const char *)key;
 }
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 1894296e6d7..1a1dac23507 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -167,7 +167,7 @@ svga_translate_decl_sm30(struct svga_shader_emitter *emit,
 
 
 /** Emit the given SVGA3dShaderInstToken opcode */
-static INLINE boolean
+static inline boolean
 emit_instruction(struct svga_shader_emitter *emit,
                  SVGA3dShaderInstToken opcode)
 {
@@ -176,7 +176,7 @@ emit_instruction(struct svga_shader_emitter *emit,
 
 
 /** Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode */
-static INLINE SVGA3dShaderInstToken
+static inline SVGA3dShaderInstToken
 inst_token(unsigned opcode)
 {
    SVGA3dShaderInstToken inst;
@@ -192,7 +192,7 @@ inst_token(unsigned opcode)
  * Generate a SVGA3dShaderInstToken for the given SVGA3D shader opcode
  * with the predication flag set.
  */
-static INLINE SVGA3dShaderInstToken
+static inline SVGA3dShaderInstToken
 inst_token_predicated(unsigned opcode)
 {
    SVGA3dShaderInstToken inst;
@@ -209,7 +209,7 @@ inst_token_predicated(unsigned opcode)
  * Generate a SVGA3dShaderInstToken for a SETP instruction (set predicate)
  * using the given comparison operator (one of SVGA3DOPCOMP_xx).
  */
-static INLINE SVGA3dShaderInstToken
+static inline SVGA3dShaderInstToken
 inst_token_setp(unsigned operator)
 {
    SVGA3dShaderInstToken inst;
@@ -227,7 +227,7 @@ inst_token_setp(unsigned operator)
  * Note that this function is used to create tokens for output registers,
  * temp registers AND constants (see emit_def_const()).
  */
-static INLINE SVGA3dShaderDestToken
+static inline SVGA3dShaderDestToken
 dst_register(unsigned file, int number)
 {
    SVGA3dShaderDestToken dest;
@@ -255,7 +255,7 @@ dst_register(unsigned file, int number)
  * Apply a writemask to the given SVGA3dShaderDestToken, returning a
  * new SVGA3dShaderDestToken.
  */
-static INLINE SVGA3dShaderDestToken
+static inline SVGA3dShaderDestToken
 writemask(SVGA3dShaderDestToken dest, unsigned mask)
 {
    assert(dest.mask & mask);
@@ -265,7 +265,7 @@ writemask(SVGA3dShaderDestToken dest, unsigned mask)
 
 
 /** Create a SVGA3dShaderSrcToken given a register file and number */
-static INLINE SVGA3dShaderSrcToken
+static inline SVGA3dShaderSrcToken
 src_token(unsigned file, int number)
 {
    SVGA3dShaderSrcToken src;
@@ -289,7 +289,7 @@ src_token(unsigned file, int number)
 
 
 /** Create a src_register given a register file and register number */
-static INLINE struct src_register
+static inline struct src_register
 src_register(unsigned file, int number)
 {
    struct src_register src;
@@ -301,7 +301,7 @@ src_register(unsigned file, int number)
 }
 
 /** Translate src_register into SVGA3dShaderDestToken */
-static INLINE SVGA3dShaderDestToken
+static inline SVGA3dShaderDestToken
 dst(struct src_register src)
 {
    return dst_register(SVGA3dShaderGetRegType(src.base.value), src.base.num);
@@ -309,7 +309,7 @@ dst(struct src_register src)
 
 
 /** Translate SVGA3dShaderDestToken to a src_register */
-static INLINE struct src_register
+static inline struct src_register
 src(SVGA3dShaderDestToken dst)
 {
    return src_register(SVGA3dShaderGetRegType(dst.value), dst.num);
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader.h b/src/gallium/drivers/svga/svgadump/svga_shader.h
index 5db64bf135b..0a2e3d5f345 100644
--- a/src/gallium/drivers/svga/svgadump/svga_shader.h
+++ b/src/gallium/drivers/svga/svgadump/svga_shader.h
@@ -56,7 +56,7 @@ struct sh_reg
    unsigned is_reg:1;
 };
 
-static INLINE unsigned
+static inline unsigned
 sh_reg_type( struct sh_reg reg )
 {
    return reg.type_lo | (reg.type_hi << 3);
@@ -138,7 +138,7 @@ struct sh_dstreg
    unsigned is_reg:1;
 };
 
-static INLINE unsigned
+static inline unsigned
 sh_dstreg_type( struct sh_dstreg reg )
 {
    return reg.type_lo | (reg.type_hi << 3);
@@ -169,7 +169,7 @@ struct sh_srcreg
    unsigned is_reg:1;
 };
 
-static INLINE unsigned
+static inline unsigned
 sh_srcreg_type( struct sh_srcreg reg )
 {
    return reg.type_lo | (reg.type_hi << 3);
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 0013c963e7a..7f6d0645112 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -49,13 +49,13 @@ struct trace_query
 };
 
 
-static INLINE struct trace_query *
+static inline struct trace_query *
 trace_query(struct pipe_query *query) {
    return (struct trace_query *)query;
 }
 
 
-static INLINE struct pipe_query *
+static inline struct pipe_query *
 trace_query_unwrap(struct pipe_query *query)
 {
    if (query) {
@@ -66,7 +66,7 @@ trace_query_unwrap(struct pipe_query *query)
 }
 
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 trace_resource_unwrap(struct trace_context *tr_ctx,
                      struct pipe_resource *resource)
 {
@@ -82,7 +82,7 @@ trace_resource_unwrap(struct trace_context *tr_ctx,
 }
 
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 trace_surface_unwrap(struct trace_context *tr_ctx,
                      struct pipe_surface *surface)
 {
@@ -105,7 +105,7 @@ trace_surface_unwrap(struct trace_context *tr_ctx,
 }
 
 
-static INLINE void
+static inline void
 trace_context_draw_vbo(struct pipe_context *_pipe,
                        const struct pipe_draw_info *info)
 {
@@ -125,7 +125,7 @@ trace_context_draw_vbo(struct pipe_context *_pipe,
 }
 
 
-static INLINE struct pipe_query *
+static inline struct pipe_query *
 trace_context_create_query(struct pipe_context *_pipe,
                            unsigned query_type,
                            unsigned index)
@@ -163,7 +163,7 @@ trace_context_create_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_destroy_query(struct pipe_context *_pipe,
                             struct pipe_query *_query)
 {
@@ -185,7 +185,7 @@ trace_context_destroy_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE boolean
+static inline boolean
 trace_context_begin_query(struct pipe_context *_pipe,
                           struct pipe_query *query)
 {
@@ -207,7 +207,7 @@ trace_context_begin_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_end_query(struct pipe_context *_pipe,
                         struct pipe_query *query)
 {
@@ -227,7 +227,7 @@ trace_context_end_query(struct pipe_context *_pipe,
 }
 
 
-static INLINE boolean
+static inline boolean
 trace_context_get_query_result(struct pipe_context *_pipe,
                                struct pipe_query *_query,
                                boolean wait,
@@ -262,7 +262,7 @@ trace_context_get_query_result(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_blend_state(struct pipe_context *_pipe,
                                  const struct pipe_blend_state *state)
 {
@@ -285,7 +285,7 @@ trace_context_create_blend_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_blend_state(struct pipe_context *_pipe,
                                void *state)
 {
@@ -303,7 +303,7 @@ trace_context_bind_blend_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_blend_state(struct pipe_context *_pipe,
                                  void *state)
 {
@@ -321,7 +321,7 @@ trace_context_delete_blend_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_sampler_state(struct pipe_context *_pipe,
                                    const struct pipe_sampler_state *state)
 {
@@ -344,7 +344,7 @@ trace_context_create_sampler_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_sampler_states(struct pipe_context *_pipe,
                                   unsigned shader,
                                   unsigned start,
@@ -371,7 +371,7 @@ trace_context_bind_sampler_states(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_sampler_state(struct pipe_context *_pipe,
                                    void *state)
 {
@@ -389,7 +389,7 @@ trace_context_delete_sampler_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_rasterizer_state(struct pipe_context *_pipe,
                                       const struct pipe_rasterizer_state *state)
 {
@@ -412,7 +412,7 @@ trace_context_create_rasterizer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_rasterizer_state(struct pipe_context *_pipe,
                                     void *state)
 {
@@ -430,7 +430,7 @@ trace_context_bind_rasterizer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_rasterizer_state(struct pipe_context *_pipe,
                                       void *state)
 {
@@ -448,7 +448,7 @@ trace_context_delete_rasterizer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void *
+static inline void *
 trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
                                                const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -471,7 +471,7 @@ trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
                                              void *state)
 {
@@ -489,7 +489,7 @@ trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
                                                void *state)
 {
@@ -508,7 +508,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
 
 
 #define TRACE_SHADER_STATE(shader_type) \
-   static INLINE void * \
+   static inline void * \
    trace_context_create_##shader_type##_state(struct pipe_context *_pipe, \
                                  const struct pipe_shader_state *state) \
    { \
@@ -524,7 +524,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
       return result; \
    } \
     \
-   static INLINE void \
+   static inline void \
    trace_context_bind_##shader_type##_state(struct pipe_context *_pipe, \
                                void *state) \
    { \
@@ -537,7 +537,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
       trace_dump_call_end(); \
    } \
     \
-   static INLINE void \
+   static inline void \
    trace_context_delete_##shader_type##_state(struct pipe_context *_pipe, \
                                  void *state) \
    { \
@@ -559,7 +559,7 @@ TRACE_SHADER_STATE(tes)
 #undef TRACE_SHADER_STATE
 
 
-static INLINE void *
+static inline void *
 trace_context_create_vertex_elements_state(struct pipe_context *_pipe,
                                            unsigned num_elements,
                                            const struct  pipe_vertex_element *elements)
@@ -587,7 +587,7 @@ trace_context_create_vertex_elements_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_bind_vertex_elements_state(struct pipe_context *_pipe,
                                          void *state)
 {
@@ -605,7 +605,7 @@ trace_context_bind_vertex_elements_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_delete_vertex_elements_state(struct pipe_context *_pipe,
                                            void *state)
 {
@@ -623,7 +623,7 @@ trace_context_delete_vertex_elements_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_blend_color(struct pipe_context *_pipe,
                               const struct pipe_blend_color *state)
 {
@@ -641,7 +641,7 @@ trace_context_set_blend_color(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_stencil_ref(struct pipe_context *_pipe,
                               const struct pipe_stencil_ref *state)
 {
@@ -659,7 +659,7 @@ trace_context_set_stencil_ref(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_clip_state(struct pipe_context *_pipe,
                              const struct pipe_clip_state *state)
 {
@@ -676,7 +676,7 @@ trace_context_set_clip_state(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_set_sample_mask(struct pipe_context *_pipe,
                               unsigned sample_mask)
 {
@@ -693,7 +693,7 @@ trace_context_set_sample_mask(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_set_constant_buffer(struct pipe_context *_pipe,
                                   uint shader, uint index,
                                   struct pipe_constant_buffer *constant_buffer)
@@ -721,7 +721,7 @@ trace_context_set_constant_buffer(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_framebuffer_state(struct pipe_context *_pipe,
                                     const struct pipe_framebuffer_state *state)
 {
@@ -751,7 +751,7 @@ trace_context_set_framebuffer_state(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_polygon_stipple(struct pipe_context *_pipe,
                                   const struct pipe_poly_stipple *state)
 {
@@ -769,7 +769,7 @@ trace_context_set_polygon_stipple(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_scissor_states(struct pipe_context *_pipe,
                                  unsigned start_slot,
                                  unsigned num_scissors,
@@ -791,7 +791,7 @@ trace_context_set_scissor_states(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_viewport_states(struct pipe_context *_pipe,
                                   unsigned start_slot,
                                   unsigned num_viewports,
@@ -938,7 +938,7 @@ trace_context_surface_destroy(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_sampler_views(struct pipe_context *_pipe,
                                 unsigned shader,
                                 unsigned start,
@@ -974,7 +974,7 @@ trace_context_set_sampler_views(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_vertex_buffers(struct pipe_context *_pipe,
                                  unsigned start_slot, unsigned num_buffers,
                                  const struct pipe_vertex_buffer *buffers)
@@ -1008,7 +1008,7 @@ trace_context_set_vertex_buffers(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_index_buffer(struct pipe_context *_pipe,
                                const struct pipe_index_buffer *ib)
 {
@@ -1033,7 +1033,7 @@ trace_context_set_index_buffer(struct pipe_context *_pipe,
 }
 
 
-static INLINE struct pipe_stream_output_target *
+static inline struct pipe_stream_output_target *
 trace_context_create_stream_output_target(struct pipe_context *_pipe,
                                           struct pipe_resource *res,
                                           unsigned buffer_offset,
@@ -1063,7 +1063,7 @@ trace_context_create_stream_output_target(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_stream_output_target_destroy(
    struct pipe_context *_pipe,
    struct pipe_stream_output_target *target)
@@ -1082,7 +1082,7 @@ trace_context_stream_output_target_destroy(
 }
 
 
-static INLINE void
+static inline void
 trace_context_set_stream_output_targets(struct pipe_context *_pipe,
                                         unsigned num_targets,
                                         struct pipe_stream_output_target **tgs,
@@ -1104,7 +1104,7 @@ trace_context_set_stream_output_targets(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_resource_copy_region(struct pipe_context *_pipe,
                                    struct pipe_resource *dst,
                                    unsigned dst_level,
@@ -1139,7 +1139,7 @@ trace_context_resource_copy_region(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_blit(struct pipe_context *_pipe,
                    const struct pipe_blit_info *_info)
 {
@@ -1181,7 +1181,7 @@ trace_context_flush_resource(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_clear(struct pipe_context *_pipe,
                     unsigned buffers,
                     const union pipe_color_union *color,
@@ -1210,7 +1210,7 @@ trace_context_clear(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_clear_render_target(struct pipe_context *_pipe,
                                   struct pipe_surface *dst,
                                   const union pipe_color_union *color,
@@ -1237,7 +1237,7 @@ trace_context_clear_render_target(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_clear_depth_stencil(struct pipe_context *_pipe,
                                   struct pipe_surface *dst,
                                   unsigned clear_flags,
@@ -1269,7 +1269,7 @@ trace_context_clear_depth_stencil(struct pipe_context *_pipe,
    trace_dump_call_end();
 }
 
-static INLINE void
+static inline void
 trace_context_flush(struct pipe_context *_pipe,
                     struct pipe_fence_handle **fence,
                     unsigned flags)
@@ -1291,7 +1291,7 @@ trace_context_flush(struct pipe_context *_pipe,
 }
 
 
-static INLINE void
+static inline void
 trace_context_destroy(struct pipe_context *_pipe)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h
index 1e5ad88d034..ad57d9d5243 100644
--- a/src/gallium/drivers/trace/tr_context.h
+++ b/src/gallium/drivers/trace/tr_context.h
@@ -54,7 +54,7 @@ void
 trace_context_check(const struct pipe_context *pipe);
 
 
-static INLINE struct trace_context *
+static inline struct trace_context *
 trace_context(struct pipe_context *pipe)
 {
    assert(pipe);
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index 753b92d8b54..601e2cbbec5 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -64,7 +64,7 @@ static long unsigned call_no = 0;
 static boolean dumping = FALSE;
 
 
-static INLINE void
+static inline void
 trace_dump_write(const char *buf, size_t size)
 {
    if (stream) {
@@ -73,14 +73,14 @@ trace_dump_write(const char *buf, size_t size)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_writes(const char *s)
 {
    trace_dump_write(s, strlen(s));
 }
 
 
-static INLINE void
+static inline void
 trace_dump_writef(const char *format, ...)
 {
    static char buf[1024];
@@ -93,7 +93,7 @@ trace_dump_writef(const char *format, ...)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_escape(const char *str)
 {
    const unsigned char *p = (const unsigned char *)str;
@@ -117,7 +117,7 @@ trace_dump_escape(const char *str)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_indent(unsigned level)
 {
    unsigned i;
@@ -126,14 +126,14 @@ trace_dump_indent(unsigned level)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_newline(void)
 {
    trace_dump_writes("\n");
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag(const char *name)
 {
    trace_dump_writes("<");
@@ -142,7 +142,7 @@ trace_dump_tag(const char *name)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_begin(const char *name)
 {
    trace_dump_writes("<");
@@ -150,7 +150,7 @@ trace_dump_tag_begin(const char *name)
    trace_dump_writes(">");
 }
 
-static INLINE void
+static inline void
 trace_dump_tag_begin1(const char *name,
                       const char *attr1, const char *value1)
 {
@@ -164,7 +164,7 @@ trace_dump_tag_begin1(const char *name,
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_begin2(const char *name,
                       const char *attr1, const char *value1,
                       const char *attr2, const char *value2)
@@ -183,7 +183,7 @@ trace_dump_tag_begin2(const char *name,
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_begin3(const char *name,
                       const char *attr1, const char *value1,
                       const char *attr2, const char *value2,
@@ -207,7 +207,7 @@ trace_dump_tag_begin3(const char *name,
 }
 
 
-static INLINE void
+static inline void
 trace_dump_tag_end(const char *name)
 {
    trace_dump_writes("</");
diff --git a/src/gallium/drivers/trace/tr_dump_defines.h b/src/gallium/drivers/trace/tr_dump_defines.h
index 0c83c2b68f1..b38d63eac59 100644
--- a/src/gallium/drivers/trace/tr_dump_defines.h
+++ b/src/gallium/drivers/trace/tr_dump_defines.h
@@ -34,7 +34,7 @@
 #include "tr_dump.h"
 
 
-static INLINE void
+static inline void
 trace_dump_format(enum pipe_format format)
 {
    if (!trace_dumping_enabled_locked())
@@ -44,7 +44,7 @@ trace_dump_format(enum pipe_format format)
 }
 
 
-static INLINE void
+static inline void
 trace_dump_query_type(unsigned value)
 {
    if (!trace_dumping_enabled_locked())
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 266626defa8..1d86a378eea 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -369,29 +369,6 @@ trace_screen_fence_reference(struct pipe_screen *_screen,
 }
 
 
-static boolean
-trace_screen_fence_signalled(struct pipe_screen *_screen,
-                             struct pipe_fence_handle *fence)
-{
-   struct trace_screen *tr_scr = trace_screen(_screen);
-   struct pipe_screen *screen = tr_scr->screen;
-   int result;
-
-   trace_dump_call_begin("pipe_screen", "fence_signalled");
-
-   trace_dump_arg(ptr, screen);
-   trace_dump_arg(ptr, fence);
-
-   result = screen->fence_signalled(screen, fence);
-
-   trace_dump_ret(bool, result);
-
-   trace_dump_call_end();
-
-   return result;
-}
-
-
 static boolean
 trace_screen_fence_finish(struct pipe_screen *_screen,
                           struct pipe_fence_handle *fence,
@@ -503,7 +480,6 @@ trace_screen_create(struct pipe_screen *screen)
    tr_scr->base.resource_get_handle = trace_screen_resource_get_handle;
    tr_scr->base.resource_destroy = trace_screen_resource_destroy;
    tr_scr->base.fence_reference = trace_screen_fence_reference;
-   tr_scr->base.fence_signalled = trace_screen_fence_signalled;
    tr_scr->base.fence_finish = trace_screen_fence_finish;
    tr_scr->base.flush_frontbuffer = trace_screen_flush_frontbuffer;
    tr_scr->base.get_timestamp = trace_screen_get_timestamp;
diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h
index 5e45c3c2f8f..e48b7b39e24 100644
--- a/src/gallium/drivers/trace/tr_texture.h
+++ b/src/gallium/drivers/trace/tr_texture.h
@@ -85,7 +85,7 @@ struct trace_transfer
 };
 
 
-static INLINE struct trace_resource *
+static inline struct trace_resource *
 trace_resource(struct pipe_resource *texture)
 {
    if(!texture)
@@ -95,7 +95,7 @@ trace_resource(struct pipe_resource *texture)
 }
 
 
-static INLINE struct trace_surface *
+static inline struct trace_surface *
 trace_surface(struct pipe_surface *surface)
 {
    if(!surface)
@@ -105,7 +105,7 @@ trace_surface(struct pipe_surface *surface)
 }
 
 
-static INLINE struct trace_sampler_view *
+static inline struct trace_sampler_view *
 trace_sampler_view(struct pipe_sampler_view *sampler_view)
 {
    if (!sampler_view)
@@ -114,7 +114,7 @@ trace_sampler_view(struct pipe_sampler_view *sampler_view)
 }
 
 
-static INLINE struct trace_transfer *
+static inline struct trace_transfer *
 trace_transfer(struct pipe_transfer *transfer)
 {
    if(!transfer)
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index 3f62ce21a9f..f4a57ba3404 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
@@ -30,10 +28,10 @@ SIM_LDFLAGS = -lsimpenrose
 endif
 
 AM_CFLAGS = \
+	-I$(top_builddir)/src/glsl/nir \
 	$(LIBDRM_CFLAGS) \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(SIM_CFLAGS) \
-	-I$(top_srcdir)/src/mesa/ \
 	$()
 
 noinst_LTLIBRARIES = libvc4.la
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 1eb029e67e7..6fb40c20562 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -19,6 +19,8 @@ C_SOURCES := \
 	vc4_fence.c \
 	vc4_formats.c \
 	vc4_job.c \
+	vc4_nir_lower_blend.c \
+	vc4_nir_lower_io.c \
 	vc4_opt_algebraic.c \
 	vc4_opt_constant_folding.c \
 	vc4_opt_copy_propagation.c \
@@ -49,4 +51,5 @@ C_SOURCES := \
 	vc4_state.c \
 	vc4_tiling.c \
 	vc4_tiling.h \
+	vc4_uniforms.c \
 	$()
diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 1fd8aa9fb28..ffc973735ae 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -26,17 +26,6 @@
 
 #include "vc4_simulator_validate.h"
 
-enum vc4_bo_mode {
-	VC4_MODE_UNDECIDED,
-	VC4_MODE_RENDER,
-	VC4_MODE_SHADER,
-};
-
-struct vc4_bo_exec_state {
-	struct drm_gem_cma_object *bo;
-	enum vc4_bo_mode mode;
-};
-
 struct vc4_exec_info {
 	/* Sequence number for this bin/render job. */
 	uint64_t seqno;
@@ -47,7 +36,7 @@ struct vc4_exec_info {
 	/* This is the array of BOs that were looked up at the start of exec.
 	 * Command validation will use indices into this array.
 	 */
-	struct vc4_bo_exec_state *bo;
+	struct drm_gem_cma_object **bo;
 	uint32_t bo_count;
 
 	/* List of other BOs used in the job that need to be released
@@ -72,7 +61,6 @@ struct vc4_exec_info {
 	 * command lists.
 	 */
 	struct vc4_shader_state {
-		uint8_t packet;
 		uint32_t addr;
 		/* Maximum vertex index referenced by any primitive using this
 		 * shader state.
@@ -88,6 +76,7 @@ struct vc4_exec_info {
 	bool found_tile_binning_mode_config_packet;
 	bool found_start_tile_binning_packet;
 	bool found_increment_semaphore_packet;
+	bool found_flush;
 	uint8_t bin_tiles_x, bin_tiles_y;
 	struct drm_gem_cma_object *tile_bo;
 	uint32_t tile_alloc_offset;
@@ -99,6 +88,9 @@ struct vc4_exec_info {
 	uint32_t ct0ca, ct0ea;
 	uint32_t ct1ca, ct1ea;
 
+	/* Pointer to the unvalidated bin CL (if present). */
+	void *bin_u;
+
 	/* Pointers to the shader recs.  These paddr gets incremented as CL
 	 * packets are relocated in validate_gl_shader_state, and the vaddrs
 	 * (u and v) get incremented and size decremented as the shader recs
@@ -168,10 +160,8 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
 
-bool vc4_use_bo(struct vc4_exec_info *exec,
-		uint32_t hindex,
-		enum vc4_bo_mode mode,
-		struct drm_gem_cma_object **obj);
+struct drm_gem_cma_object *vc4_use_bo(struct vc4_exec_info *exec,
+				      uint32_t hindex);
 
 int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec);
 
diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c
index e4b7fea5968..93f9ec7ed9b 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_gem.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c
@@ -112,6 +112,8 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
 
 	exec->ct0ca = exec->exec_bo->paddr + bin_offset;
 
+	exec->bin_u = bin;
+
 	exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
 	exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
 	exec->shader_rec_size = args->shader_rec_size;
diff --git a/src/gallium/drivers/vc4/kernel/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h
index 88cfc0fa9f0..771e2b78761 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_packet.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h
@@ -88,16 +88,22 @@ enum vc4_packet {
 #define VC4_PACKET_START_TILE_BINNING_SIZE				1
 #define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE				1
 #define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE				1
+#define VC4_PACKET_BRANCH_SIZE						5
 #define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE				5
 #define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE				1
 #define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE			1
+#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE			5
+#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE			5
 #define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE			7
 #define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE			7
 #define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE				14
 #define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE				10
+#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE				1
+#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE			1
 #define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE				2
 #define VC4_PACKET_GL_SHADER_STATE_SIZE					5
 #define VC4_PACKET_NV_SHADER_STATE_SIZE					5
+#define VC4_PACKET_VG_SHADER_STATE_SIZE					5
 #define VC4_PACKET_CONFIGURATION_BITS_SIZE				4
 #define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE				5
 #define VC4_PACKET_POINT_SIZE_SIZE					5
@@ -106,6 +112,7 @@ enum vc4_packet {
 #define VC4_PACKET_DEPTH_OFFSET_SIZE					5
 #define VC4_PACKET_CLIP_WINDOW_SIZE					9
 #define VC4_PACKET_VIEWPORT_OFFSET_SIZE					5
+#define VC4_PACKET_Z_CLIPPING_SIZE					9
 #define VC4_PACKET_CLIPPER_XY_SCALING_SIZE				9
 #define VC4_PACKET_CLIPPER_Z_SCALING_SIZE				9
 #define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE			16
@@ -134,6 +141,17 @@ enum vc4_packet {
 #define VC4_TILING_FORMAT_LT        2
 /** @} */
 
+/** @{
+ *
+ * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and
+ * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER.
+ */
+#define VC4_LOADSTORE_FULL_RES_EOF                     (1 << 3)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL       (1 << 2)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              (1 << 1)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           (1 << 0)
+/** @} */
+
 /** @{
  *
  * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
index e2d907ad91f..b827eb7e9e1 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -100,7 +100,8 @@ static void emit_tile(struct vc4_exec_info *exec,
 		      struct vc4_rcl_setup *setup,
 		      uint8_t x, uint8_t y, bool first, bool last)
 {
-	bool has_bin = exec->args->bin_cl_size != 0;
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
 
 	/* Note that the load doesn't actually occur until the
 	 * tile coords packet is processed, and only one load
@@ -108,10 +109,9 @@ static void emit_tile(struct vc4_exec_info *exec,
 	 */
 	if (setup->color_read) {
 		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, exec->args->color_read.bits);
+		rcl_u16(setup, args->color_read.bits);
 		rcl_u32(setup,
-			setup->color_read->paddr +
-			exec->args->color_read.offset);
+			setup->color_read->paddr + args->color_read.offset);
 	}
 
 	if (setup->zs_read) {
@@ -122,9 +122,8 @@ static void emit_tile(struct vc4_exec_info *exec,
 		}
 
 		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, exec->args->zs_read.bits);
-		rcl_u32(setup,
-			setup->zs_read->paddr + exec->args->zs_read.offset);
+		rcl_u16(setup, args->zs_read.bits);
+		rcl_u32(setup, setup->zs_read->paddr + args->zs_read.offset);
 	}
 
 	/* Clipping depends on tile coordinates having been
@@ -147,11 +146,11 @@ static void emit_tile(struct vc4_exec_info *exec,
 
 	if (setup->zs_write) {
 		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
-		rcl_u16(setup, exec->args->zs_write.bits |
+		rcl_u16(setup, args->zs_write.bits |
 			(setup->color_ms_write ?
 			 VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0));
 		rcl_u32(setup,
-			(setup->zs_write->paddr + exec->args->zs_write.offset) |
+			(setup->zs_write->paddr + args->zs_write.offset) |
 			((last && !setup->color_ms_write) ?
 			 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
 	}
@@ -172,11 +171,12 @@ static void emit_tile(struct vc4_exec_info *exec,
 static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 			     struct vc4_rcl_setup *setup)
 {
-	bool has_bin = exec->args->bin_cl_size != 0;
-	uint8_t min_x_tile = exec->args->min_x_tile;
-	uint8_t min_y_tile = exec->args->min_y_tile;
-	uint8_t max_x_tile = exec->args->max_x_tile;
-	uint8_t max_y_tile = exec->args->max_y_tile;
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+	uint8_t min_x_tile = args->min_x_tile;
+	uint8_t min_y_tile = args->min_y_tile;
+	uint8_t max_x_tile = args->max_x_tile;
+	uint8_t max_y_tile = args->max_y_tile;
 	uint8_t xtiles = max_x_tile - min_x_tile + 1;
 	uint8_t ytiles = max_y_tile - min_y_tile + 1;
 	uint8_t x, y;
@@ -185,7 +185,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE;
 	loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE;
 
-	if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
 		size += VC4_PACKET_CLEAR_COLORS_SIZE +
 			VC4_PACKET_TILE_COORDINATES_SIZE +
 			VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
@@ -208,7 +208,7 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	}
 
 	if (setup->zs_write)
-		loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
 	if (setup->color_ms_write) {
 		if (setup->zs_write)
 			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
@@ -226,23 +226,23 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	rcl_u32(setup,
 		(setup->color_ms_write ?
 		 (setup->color_ms_write->paddr +
-		  exec->args->color_ms_write.offset) :
+		  args->color_ms_write.offset) :
 		 0));
-	rcl_u16(setup, exec->args->width);
-	rcl_u16(setup, exec->args->height);
-	rcl_u16(setup, exec->args->color_ms_write.bits);
+	rcl_u16(setup, args->width);
+	rcl_u16(setup, args->height);
+	rcl_u16(setup, args->color_ms_write.bits);
 
 	/* The tile buffer gets cleared when the previous tile is stored.  If
 	 * the clear values changed between frames, then the tile buffer has
 	 * stale clear values in it, so we have to do a store in None mode (no
 	 * writes) so that we trigger the tile buffer clear.
 	 */
-	if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
 		rcl_u8(setup, VC4_PACKET_CLEAR_COLORS);
-		rcl_u32(setup, exec->args->clear_color[0]);
-		rcl_u32(setup, exec->args->clear_color[1]);
-		rcl_u32(setup, exec->args->clear_z);
-		rcl_u8(setup, exec->args->clear_s);
+		rcl_u32(setup, args->clear_color[0]);
+		rcl_u32(setup, args->clear_color[1]);
+		rcl_u32(setup, args->clear_z);
+		rcl_u8(setup, args->clear_s);
 
 		vc4_tile_coordinates(setup, 0, 0);
 
@@ -286,7 +286,8 @@ static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
 	if (surf->hindex == ~0)
 		return 0;
 
-	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+	*obj = vc4_use_bo(exec, surf->hindex);
+	if (!*obj)
 		return -EINVAL;
 
 	if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
@@ -365,7 +366,8 @@ vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
 	if (surf->hindex == ~0)
 		return 0;
 
-	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+	*obj = vc4_use_bo(exec, surf->hindex);
+	if (!*obj)
 		return -EINVAL;
 
 	if (tiling > VC4_TILING_FORMAT_LT) {
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index a0b67a7e50b..b248831113c 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -94,42 +94,42 @@ size_is_lt(uint32_t width, uint32_t height, int cpp)
 		height <= 4 * utile_height(cpp));
 }
 
-bool
-vc4_use_bo(struct vc4_exec_info *exec,
-	   uint32_t hindex,
-	   enum vc4_bo_mode mode,
-	   struct drm_gem_cma_object **obj)
+struct drm_gem_cma_object *
+vc4_use_bo(struct vc4_exec_info *exec, uint32_t hindex)
 {
-	*obj = NULL;
+	struct drm_gem_cma_object *obj;
+	struct drm_vc4_bo *bo;
 
 	if (hindex >= exec->bo_count) {
 		DRM_ERROR("BO index %d greater than BO count %d\n",
 			  hindex, exec->bo_count);
-		return false;
+		return NULL;
+	}
+	obj = exec->bo[hindex];
+	bo = to_vc4_bo(&obj->base);
+
+	if (bo->validated_shader) {
+		DRM_ERROR("Trying to use shader BO as something other than "
+			  "a shader\n");
+		return NULL;
 	}
 
-	if (exec->bo[hindex].mode != mode) {
-		if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) {
-			exec->bo[hindex].mode = mode;
-		} else {
-			DRM_ERROR("BO index %d reused with mode %d vs %d\n",
-				  hindex, exec->bo[hindex].mode, mode);
-			return false;
-		}
-	}
+	return obj;
+}
 
-	*obj = exec->bo[hindex].bo;
-	return true;
+static struct drm_gem_cma_object *
+vc4_use_handle(struct vc4_exec_info *exec, uint32_t gem_handles_packet_index)
+{
+	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index]);
 }
 
 static bool
-vc4_use_handle(struct vc4_exec_info *exec,
-	       uint32_t gem_handles_packet_index,
-	       enum vc4_bo_mode mode,
-	       struct drm_gem_cma_object **obj)
+validate_bin_pos(struct vc4_exec_info *exec, void *untrusted, uint32_t pos)
 {
-	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index],
-			  mode, obj);
+	/* Note that the untrusted pointer passed to these functions is
+	 * incremented past the packet byte.
+	 */
+	return (untrusted - 1 == exec->bin_u + pos);
 }
 
 static uint32_t
@@ -201,14 +201,15 @@ vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
 	return true;
 }
 
+
 static int
-validate_flush_all(VALIDATE_ARGS)
+validate_flush(VALIDATE_ARGS)
 {
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("VC4_PACKET_FLUSH_ALL after "
-			  "VC4_PACKET_INCREMENT_SEMAPHORE\n");
+	if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 1)) {
+		DRM_ERROR("Bin CL must end with VC4_PACKET_FLUSH\n");
 		return -EINVAL;
 	}
+	exec->found_flush = true;
 
 	return 0;
 }
@@ -233,17 +234,13 @@ validate_start_tile_binning(VALIDATE_ARGS)
 static int
 validate_increment_semaphore(VALIDATE_ARGS)
 {
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n");
+	if (!validate_bin_pos(exec, untrusted, exec->args->bin_cl_size - 2)) {
+		DRM_ERROR("Bin CL must end with "
+			  "VC4_PACKET_INCREMENT_SEMAPHORE\n");
 		return -EINVAL;
 	}
 	exec->found_increment_semaphore_packet = true;
 
-	/* Once we've found the semaphore increment, there should be one FLUSH
-	 * then the end of the command list.  The FLUSH actually triggers the
-	 * increment, so we only need to make sure there
-	 */
-
 	return 0;
 }
 
@@ -257,11 +254,6 @@ validate_indexed_prim_list(VALIDATE_ARGS)
 	uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 2 : 1;
 	struct vc4_shader_state *shader_state;
 
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
-		return -EINVAL;
-	}
-
 	/* Check overflow condition */
 	if (exec->shader_state_count == 0) {
 		DRM_ERROR("shader state must precede primitives\n");
@@ -272,7 +264,8 @@ validate_indexed_prim_list(VALIDATE_ARGS)
 	if (max_index > shader_state->max_index)
 		shader_state->max_index = max_index;
 
-	if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib))
+	ib = vc4_use_handle(exec, 0);
+	if (!ib)
 		return -EINVAL;
 
 	if (offset > ib->base.size ||
@@ -295,11 +288,6 @@ validate_gl_array_primitive(VALIDATE_ARGS)
 	uint32_t max_index;
 	struct vc4_shader_state *shader_state;
 
-	if (exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
-		return -EINVAL;
-	}
-
 	/* Check overflow condition */
 	if (exec->shader_state_count == 0) {
 		DRM_ERROR("shader state must precede primitives\n");
@@ -329,7 +317,6 @@ validate_gl_shader_state(VALIDATE_ARGS)
 		return -EINVAL;
 	}
 
-	exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE;
 	exec->shader_state[i].addr = *(uint32_t *)untrusted;
 	exec->shader_state[i].max_index = 0;
 
@@ -347,31 +334,6 @@ validate_gl_shader_state(VALIDATE_ARGS)
 	return 0;
 }
 
-static int
-validate_nv_shader_state(VALIDATE_ARGS)
-{
-	uint32_t i = exec->shader_state_count++;
-
-	if (i >= exec->shader_state_size) {
-		DRM_ERROR("More requests for shader states than declared\n");
-		return -EINVAL;
-	}
-
-	exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE;
-	exec->shader_state[i].addr = *(uint32_t *)untrusted;
-
-	if (exec->shader_state[i].addr & 15) {
-		DRM_ERROR("NV shader state address 0x%08x misaligned\n",
-			  exec->shader_state[i].addr);
-		return -EINVAL;
-	}
-
-	*(uint32_t *)validated = (exec->shader_state[i].addr +
-				  exec->shader_rec_p);
-
-	return 0;
-}
-
 static int
 validate_tile_binning_config(VALIDATE_ARGS)
 {
@@ -473,8 +435,8 @@ static const struct cmd_info {
 } cmd_info[] = {
 	VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL),
 	VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL),
-	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", validate_flush),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", NULL),
 	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning),
 	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore),
 
@@ -488,7 +450,7 @@ static const struct cmd_info {
 	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL),
 
 	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state),
-	VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state),
+	/* We don't support validating NV shader states. */
 
 	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL),
 	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL),
@@ -525,7 +487,7 @@ vc4_validate_bin_cl(struct drm_device *dev,
 		u8 cmd = *(uint8_t *)src_pkt;
 		const struct cmd_info *info;
 
-		if (cmd > ARRAY_SIZE(cmd_info)) {
+		if (cmd >= ARRAY_SIZE(cmd_info)) {
 			DRM_ERROR("0x%08x: packet %d out of bounds\n",
 				  src_offset, cmd);
 			return -EINVAL;
@@ -580,8 +542,16 @@ vc4_validate_bin_cl(struct drm_device *dev,
 		return -EINVAL;
 	}
 
-	if (!exec->found_increment_semaphore_packet) {
-		DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n");
+	/* The bin CL must be ended with INCREMENT_SEMAPHORE and FLUSH.  The
+	 * semaphore is used to trigger the render CL to start up, and the
+	 * FLUSH is what caps the bin lists with
+	 * VC4_PACKET_RETURN_FROM_SUB_LIST (so they jump back to the main
+	 * render CL when they get called to) and actually triggers the queued
+	 * semaphore increment.
+	 */
+	if (!exec->found_increment_semaphore_packet || !exec->found_flush) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE + "
+			  "VC4_PACKET_FLUSH\n");
 		return -EINVAL;
 	}
 
@@ -612,18 +582,19 @@ reloc_tex(struct vc4_exec_info *exec,
 	uint32_t cube_map_stride = 0;
 	enum vc4_texture_data_type type;
 
-	if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
+	tex = vc4_use_bo(exec, texture_handle_index);
+	if (!tex)
 		return false;
 
 	if (sample->is_direct) {
 		uint32_t remaining_size = tex->base.size - p0;
 		if (p0 > tex->base.size - 4) {
 			DRM_ERROR("UBO offset greater than UBO size\n");
-			return false;
+			goto fail;
 		}
 		if (p1 > remaining_size - 4) {
 			DRM_ERROR("UBO clamp would allow reads outside of UBO\n");
-			return false;
+			goto fail;
 		}
 		*validated_p0 = tex->paddr + p0;
 		return true;
@@ -642,14 +613,14 @@ reloc_tex(struct vc4_exec_info *exec,
 		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) {
 			if (cube_map_stride) {
 				DRM_ERROR("Cube map stride set twice\n");
-				return false;
+				goto fail;
 			}
 
 			cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK;
 		}
 		if (!cube_map_stride) {
 			DRM_ERROR("Cube map stride not set\n");
-			return false;
+			goto fail;
 		}
 	}
 
@@ -683,7 +654,7 @@ reloc_tex(struct vc4_exec_info *exec,
 	case VC4_TEXTURE_TYPE_YUV422R:
 	default:
 		DRM_ERROR("Texture format %d unsupported\n", type);
-		return false;
+		goto fail;
 	}
 	utile_w = utile_width(cpp);
 	utile_h = utile_height(cpp);
@@ -699,7 +670,7 @@ reloc_tex(struct vc4_exec_info *exec,
 
 	if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5,
 				tiling_format, width, height, cpp)) {
-		return false;
+		goto fail;
 	}
 
 	/* The mipmap levels are stored before the base of the texture.  Make
@@ -740,7 +711,7 @@ reloc_tex(struct vc4_exec_info *exec,
 				  i, level_width, level_height,
 				  aligned_width, aligned_height,
 				  level_size, offset);
-			return false;
+			goto fail;
 		}
 
 		offset -= level_size;
@@ -749,54 +720,37 @@ reloc_tex(struct vc4_exec_info *exec,
 	*validated_p0 = tex->paddr + p0;
 
 	return true;
+ fail:
+	DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0);
+	DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1);
+	DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2);
+	DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3);
+	return false;
 }
 
 static int
-validate_shader_rec(struct drm_device *dev,
-		    struct vc4_exec_info *exec,
-		    struct vc4_shader_state *state)
+validate_gl_shader_rec(struct drm_device *dev,
+		       struct vc4_exec_info *exec,
+		       struct vc4_shader_state *state)
 {
 	uint32_t *src_handles;
 	void *pkt_u, *pkt_v;
-	enum shader_rec_reloc_type {
-		RELOC_CODE,
-		RELOC_VBO,
+	static const uint32_t shader_reloc_offsets[] = {
+		4, /* fs */
+		16, /* vs */
+		28, /* cs */
 	};
-	struct shader_rec_reloc {
-		enum shader_rec_reloc_type type;
-		uint32_t offset;
-	};
-	static const struct shader_rec_reloc gl_relocs[] = {
-		{ RELOC_CODE, 4 },  /* fs */
-		{ RELOC_CODE, 16 }, /* vs */
-		{ RELOC_CODE, 28 }, /* cs */
-	};
-	static const struct shader_rec_reloc nv_relocs[] = {
-		{ RELOC_CODE, 4 }, /* fs */
-		{ RELOC_VBO, 12 }
-	};
-	const struct shader_rec_reloc *relocs;
-	struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8];
-	uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size;
+	uint32_t shader_reloc_count = ARRAY_SIZE(shader_reloc_offsets);
+	struct drm_gem_cma_object *bo[ARRAY_SIZE(shader_reloc_offsets) + 8];
+	uint32_t nr_attributes, nr_relocs, packet_size;
 	int i;
-	struct vc4_validated_shader_info *validated_shader = NULL;
 
-	if (state->packet == VC4_PACKET_NV_SHADER_STATE) {
-		relocs = nv_relocs;
-		nr_fixed_relocs = ARRAY_SIZE(nv_relocs);
-
-		packet_size = 16;
-	} else {
-		relocs = gl_relocs;
-		nr_fixed_relocs = ARRAY_SIZE(gl_relocs);
-
-		nr_attributes = state->addr & 0x7;
-		if (nr_attributes == 0)
-			nr_attributes = 8;
-		packet_size = gl_shader_rec_size(state->addr);
-	}
-	nr_relocs = nr_fixed_relocs + nr_attributes;
+	nr_attributes = state->addr & 0x7;
+	if (nr_attributes == 0)
+		nr_attributes = 8;
+	packet_size = gl_shader_rec_size(state->addr);
 
+	nr_relocs = ARRAY_SIZE(shader_reloc_offsets) + nr_attributes;
 	if (nr_relocs * 4 > exec->shader_rec_size) {
 		DRM_ERROR("overflowed shader recs reading %d handles "
 			  "from %d bytes left\n",
@@ -826,21 +780,30 @@ validate_shader_rec(struct drm_device *dev,
 	exec->shader_rec_v += roundup(packet_size, 16);
 	exec->shader_rec_size -= packet_size;
 
-	for (i = 0; i < nr_relocs; i++) {
-		enum vc4_bo_mode mode;
-
-		if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE)
-			mode = VC4_MODE_SHADER;
-		else
-			mode = VC4_MODE_RENDER;
-
-		if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i])) {
-			return false;
-		}
+	if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
+		DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
+		return -EINVAL;
 	}
 
-	for (i = 0; i < nr_fixed_relocs; i++) {
-		uint32_t o = relocs[i].offset;
+	for (i = 0; i < shader_reloc_count; i++) {
+		/* Same bound as vc4_use_bo(): index bo_count is out of range. */
+		if (src_handles[i] >= exec->bo_count) {
+			DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
+			return -EINVAL;
+		}
+		bo[i] = exec->bo[src_handles[i]];
+		if (!bo[i])
+			return -EINVAL;
+	}
+	for (i = shader_reloc_count; i < nr_relocs; i++) {
+		bo[i] = vc4_use_bo(exec, src_handles[i]);
+		if (!bo[i])
+			return -EINVAL;
+	}
+
+	for (i = 0; i < shader_reloc_count; i++) {
+		struct vc4_validated_shader_info *validated_shader;
+		uint32_t o = shader_reloc_offsets[i];
 		uint32_t src_offset = *(uint32_t *)(pkt_u + o);
 		uint32_t *texture_handles_u;
 		void *uniform_data_u;
@@ -848,58 +811,50 @@ validate_shader_rec(struct drm_device *dev,
 
 		*(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset;
 
-		switch (relocs[i].type) {
-		case RELOC_CODE:
-			if (src_offset != 0) {
-				DRM_ERROR("Shaders must be at offset 0 of "
-					  "the BO.\n");
-				goto fail;
-			}
-
-			kfree(validated_shader);
-			validated_shader = vc4_validate_shader(bo[i]);
-			if (!validated_shader)
-				goto fail;
-
-			if (validated_shader->uniforms_src_size >
-			    exec->uniforms_size) {
-				DRM_ERROR("Uniforms src buffer overflow\n");
-				goto fail;
-			}
-
-			texture_handles_u = exec->uniforms_u;
-			uniform_data_u = (texture_handles_u +
-					  validated_shader->num_texture_samples);
-
-			memcpy(exec->uniforms_v, uniform_data_u,
-			       validated_shader->uniforms_size);
-
-			for (tex = 0;
-			     tex < validated_shader->num_texture_samples;
-			     tex++) {
-				if (!reloc_tex(exec,
-					       uniform_data_u,
-					       &validated_shader->texture_samples[tex],
-					       texture_handles_u[tex])) {
-					goto fail;
-				}
-			}
-
-			*(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
-
-			exec->uniforms_u += validated_shader->uniforms_src_size;
-			exec->uniforms_v += validated_shader->uniforms_size;
-			exec->uniforms_p += validated_shader->uniforms_size;
-
-			break;
-
-		case RELOC_VBO:
-			break;
+		if (src_offset != 0) {
+			DRM_ERROR("Shaders must be at offset 0 of "
+				  "the BO.\n");
+			return -EINVAL;
 		}
+
+		validated_shader = to_vc4_bo(&bo[i]->base)->validated_shader;
+		if (!validated_shader)
+			return -EINVAL;
+
+		if (validated_shader->uniforms_src_size >
+		    exec->uniforms_size) {
+			DRM_ERROR("Uniforms src buffer overflow\n");
+			return -EINVAL;
+		}
+
+		texture_handles_u = exec->uniforms_u;
+		uniform_data_u = (texture_handles_u +
+				  validated_shader->num_texture_samples);
+
+		memcpy(exec->uniforms_v, uniform_data_u,
+		       validated_shader->uniforms_size);
+
+		for (tex = 0;
+		     tex < validated_shader->num_texture_samples;
+		     tex++) {
+			if (!reloc_tex(exec,
+				       uniform_data_u,
+				       &validated_shader->texture_samples[tex],
+				       texture_handles_u[tex])) {
+				return -EINVAL;
+			}
+		}
+
+		*(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
+
+		exec->uniforms_u += validated_shader->uniforms_src_size;
+		exec->uniforms_v += validated_shader->uniforms_size;
+		exec->uniforms_p += validated_shader->uniforms_size;
 	}
 
 	for (i = 0; i < nr_attributes; i++) {
-		struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i];
+		struct drm_gem_cma_object *vbo =
+			bo[ARRAY_SIZE(shader_reloc_offsets) + i];
 		uint32_t o = 36 + i * 8;
 		uint32_t offset = *(uint32_t *)(pkt_u + o + 0);
 		uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1;
@@ -929,13 +884,7 @@ validate_shader_rec(struct drm_device *dev,
 		*(uint32_t *)(pkt_v + o) = vbo->paddr + offset;
 	}
 
-	kfree(validated_shader);
-
 	return 0;
-
-fail:
-	kfree(validated_shader);
-	return -EINVAL;
 }
 
 int
@@ -946,7 +895,7 @@ vc4_validate_shader_recs(struct drm_device *dev,
 	int ret = 0;
 
 	for (i = 0; i < exec->shader_state_count; i++) {
-		ret = validate_shader_rec(dev, exec, &exec->shader_state[i]);
+		ret = validate_gl_shader_rec(dev, exec, &exec->shader_state[i]);
 		if (ret)
 			return ret;
 	}
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index d29e2c9c318..e52a1941730 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -94,7 +94,7 @@ vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
         struct vc4_context *vc4 = vc4_context(ctx);
 
         if (!util_blitter_is_blit_supported(vc4->blitter, info)) {
-                fprintf(stderr, "blit unsupported %s -> %s",
+                fprintf(stderr, "blit unsupported %s -> %s\n",
                     util_format_short_name(info->src.resource->format),
                     util_format_short_name(info->dst.resource->format));
                 return false;
@@ -135,7 +135,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
             info.dst.resource->nr_samples <= 1 &&
             !util_format_is_depth_or_stencil(info.src.resource->format) &&
             !util_format_is_pure_integer(info.src.resource->format)) {
-                fprintf(stderr, "color resolve unimplemented");
+                fprintf(stderr, "color resolve unimplemented\n");
                 return;
         }
 
@@ -147,7 +147,7 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
         }
 
         if (info.mask & PIPE_MASK_S) {
-                fprintf(stderr, "cannot blit stencil, skipping");
+                fprintf(stderr, "cannot blit stencil, skipping\n");
                 info.mask &= ~PIPE_MASK_S;
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index cbdb9e89cf6..f7b41f5816d 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014 Broadcom
+ * Copyright © 2014-2015 Broadcom
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -94,7 +94,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
                  * allocate something new instead, since we assume that the
                  * user will proceed to CPU map it and fill it with stuff.
                  */
-                if (!vc4_bo_wait(bo, 0)) {
+                if (!vc4_bo_wait(bo, 0, NULL)) {
                         pipe_mutex_unlock(cache->lock);
                         return NULL;
                 }
@@ -381,15 +381,57 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo)
 }
 
 struct vc4_bo *
-vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size,
-                 const char *name)
+vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size)
 {
-        void *map;
         struct vc4_bo *bo;
+        int ret;
+
+        bo = CALLOC_STRUCT(vc4_bo);
+        if (!bo)
+                return NULL;
+
+        pipe_reference_init(&bo->reference, 1);
+        bo->screen = screen;
+        bo->size = align(size, 4096);
+        bo->name = "code";
+        bo->private = false; /* Make sure it doesn't go back to the cache. */
+
+        if (!using_vc4_simulator) {
+                struct drm_vc4_create_shader_bo create = {
+                        .size = size,
+                        .data = (uintptr_t)data,
+                };
+
+                ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO,
+                               &create);
+                bo->handle = create.handle;
+        } else {
+                struct drm_mode_create_dumb create;
+                memset(&create, 0, sizeof(create));
+                create.width = 128;
+                create.bpp = 8;
+                create.height = (size + 127) / 128;
+
+                ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
+                bo->handle = create.handle;
+                /* Only map and fill the BO once the ioctl has succeeded. */
+                if (ret == 0) {
+                        assert(create.size >= size);
+                        memcpy(vc4_bo_map(bo), data, size);
+                }
+        }
+        if (ret != 0) {
+                fprintf(stderr, "create shader ioctl failure\n");
+                abort();
+        }
+
+        screen->bo_count++;
+        screen->bo_size += bo->size;
+        if (dump_stats) {
+                fprintf(stderr, "Allocated shader %dkb:\n", size / 1024);
+                vc4_bo_dump_stats(screen);
+        }
 
-        bo = vc4_bo_alloc(screen, size, name);
-        map = vc4_bo_map(bo);
-        memcpy(map, data, size);
         return bo;
 }
 
@@ -413,63 +455,91 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name)
         return true;
 }
 
+static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns)
+{
+        if (using_vc4_simulator)
+                return 0;
+
+        struct drm_vc4_wait_seqno wait = {
+                .seqno = seqno,
+                .timeout_ns = timeout_ns,
+        };
+        int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
+        if (ret == -1)
+                return -errno;
+        else
+                return 0;
+
+}
+
 bool
-vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns)
+vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns,
+               const char *reason)
 {
         if (screen->finished_seqno >= seqno)
                 return true;
 
-        struct drm_vc4_wait_seqno wait;
-        memset(&wait, 0, sizeof(wait));
-        wait.seqno = seqno;
-        wait.timeout_ns = timeout_ns;
-
-        int ret;
-        if (!using_vc4_simulator)
-                ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
-        else {
-                wait.seqno = screen->finished_seqno;
-                ret = 0;
+        if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) {
+                if (vc4_wait_seqno_ioctl(screen->fd, seqno, 0) == -ETIME) {
+                        fprintf(stderr, "Blocking on seqno %lld for %s\n",
+                                (long long)seqno, reason);
+                }
         }
 
-        if (ret == 0) {
-                screen->finished_seqno = wait.seqno;
-                return true;
+        int ret = vc4_wait_seqno_ioctl(screen->fd, seqno, timeout_ns);
+        if (ret) {
+                if (ret != -ETIME) {
+                        fprintf(stderr, "wait failed: %d\n", ret);
+                        abort();
+                }
+
+                return false;
         }
 
-        if (errno != ETIME) {
-                fprintf(stderr, "wait failed: %d\n", ret);
-                abort();
-        }
+        screen->finished_seqno = seqno;
+        return true;
+}
+
+static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns)
+{
+        if (using_vc4_simulator)
+                return 0;
+
+        struct drm_vc4_wait_bo wait = {
+                .handle = handle,
+                .timeout_ns = timeout_ns,
+        };
+        int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait);
+        if (ret == -1)
+                return -errno;
+        else
+                return 0;
 
-        return false;
 }
 
 bool
-vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns)
+vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason)
 {
         struct vc4_screen *screen = bo->screen;
 
-        struct drm_vc4_wait_bo wait;
-        memset(&wait, 0, sizeof(wait));
-        wait.handle = bo->handle;
-        wait.timeout_ns = timeout_ns;
-
-        int ret;
-        if (!using_vc4_simulator)
-                ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_WAIT_BO, &wait);
-        else
-                ret = 0;
-
-        if (ret == 0)
-                return true;
-
-        if (errno != ETIME) {
-                fprintf(stderr, "wait failed: %d\n", ret);
-                abort();
+        if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason) {
+                if (vc4_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) {
+                        fprintf(stderr, "Blocking on %s BO for %s\n",
+                                bo->name, reason);
+                }
         }
 
-        return false;
+        int ret = vc4_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns);
+        if (ret) {
+                if (ret != -ETIME) {
+                        fprintf(stderr, "wait failed: %d\n", ret);
+                        abort();
+                }
+
+                return false;
+        }
+
+        return true;
 }
 
 void *
@@ -515,7 +585,7 @@ vc4_bo_map(struct vc4_bo *bo)
 {
         void *map = vc4_bo_map_unsynchronized(bo);
 
-        bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE);
+        bool ok = vc4_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map");
         if (!ok) {
                 fprintf(stderr, "BO wait for map failed\n");
                 abort();
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h
index 7320695ca8e..b77506e242a 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -58,8 +58,8 @@ struct vc4_bo {
 
 struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size,
                             const char *name);
-struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data,
-                                uint32_t size, const char *name);
+struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data,
+                                   uint32_t size);
 void vc4_bo_last_unreference(struct vc4_bo *bo);
 void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time);
 struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
@@ -113,10 +113,11 @@ void *
 vc4_bo_map_unsynchronized(struct vc4_bo *bo);
 
 bool
-vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns);
+vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns, const char *reason);
 
 bool
-vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns);
+vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns,
+               const char *reason);
 
 void
 vc4_bufmgr_destroy(struct pipe_screen *pscreen);
diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c
index 0700e885cbf..ced4f2dfa86 100644
--- a/src/gallium/drivers/vc4/vc4_cl.c
+++ b/src/gallium/drivers/vc4/vc4_cl.c
@@ -36,11 +36,12 @@ vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl)
 void
 cl_ensure_space(struct vc4_cl *cl, uint32_t space)
 {
-        if ((cl->next - cl->base) + space <= cl->size)
+        uint32_t offset = cl_offset(cl);
+
+        if (offset + space <= cl->size)
                 return;
 
         uint32_t size = MAX2(cl->size + space, cl->size * 2);
-        uint32_t offset = cl->next -cl->base;
 
         cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size);
         cl->size = size;
@@ -60,15 +61,20 @@ vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo)
         uint32_t hindex;
         uint32_t *current_handles = vc4->bo_handles.base;
 
-        for (hindex = 0;
-             hindex < (vc4->bo_handles.next - vc4->bo_handles.base) / 4;
-             hindex++) {
+        for (hindex = 0; hindex < cl_offset(&vc4->bo_handles) / 4; hindex++) {
                 if (current_handles[hindex] == bo->handle)
                         return hindex;
         }
 
-        cl_u32(&vc4->bo_handles, bo->handle);
-        cl_ptr(&vc4->bo_pointers, vc4_bo_reference(bo));
+        struct vc4_cl_out *out;
+
+        out = cl_start(&vc4->bo_handles);
+        cl_u32(&out, bo->handle);
+        cl_end(&vc4->bo_handles, out);
+
+        out = cl_start(&vc4->bo_pointers);
+        cl_ptr(&out, vc4_bo_reference(bo));
+        cl_end(&vc4->bo_pointers, out);
 
         return hindex;
 }
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 4a50e790942..bf4be0efc29 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -33,12 +33,20 @@
 
 struct vc4_bo;
 
+/**
+ * Undefined structure, used for typechecking that you're passing the pointers
+ * to these functions correctly.
+ */
+struct vc4_cl_out;
+
 struct vc4_cl {
         void *base;
-        void *next;
+        struct vc4_cl_out *next;
+        struct vc4_cl_out *reloc_next;
         uint32_t size;
-        uint32_t reloc_next;
+#ifdef DEBUG
         uint32_t reloc_count;
+#endif
 };
 
 void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl);
@@ -49,135 +57,149 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
 struct PACKED unaligned_16 { uint16_t x; };
 struct PACKED unaligned_32 { uint32_t x; };
 
-static inline void
-put_unaligned_32(void *ptr, uint32_t val)
+static inline uint32_t cl_offset(struct vc4_cl *cl)
 {
-        struct unaligned_32 *p = ptr;
+        return (char *)cl->next - (char *)cl->base;
+}
+
+static inline void
+cl_advance(struct vc4_cl_out **cl, uint32_t n)
+{
+        (*cl) = (struct vc4_cl_out *)((char *)(*cl) + n);
+}
+
+static inline struct vc4_cl_out *
+cl_start(struct vc4_cl *cl)
+{
+        return cl->next;
+}
+
+static inline void
+cl_end(struct vc4_cl *cl, struct vc4_cl_out *next)
+{
+        cl->next = next;
+        assert(cl_offset(cl) <= cl->size);
+}
+
+
+static inline void
+put_unaligned_32(struct vc4_cl_out *ptr, uint32_t val)
+{
+        struct unaligned_32 *p = (void *)ptr;
         p->x = val;
 }
 
 static inline void
-put_unaligned_16(void *ptr, uint16_t val)
+put_unaligned_16(struct vc4_cl_out *ptr, uint16_t val)
 {
-        struct unaligned_16 *p = ptr;
+        struct unaligned_16 *p = (void *)ptr;
         p->x = val;
 }
 
 static inline void
-cl_u8(struct vc4_cl *cl, uint8_t n)
+cl_u8(struct vc4_cl_out **cl, uint8_t n)
 {
-        assert((cl->next - cl->base) + 1 <= cl->size);
-
-        *(uint8_t *)cl->next = n;
-        cl->next++;
+        *(uint8_t *)(*cl) = n;
+        cl_advance(cl, 1);
 }
 
 static inline void
-cl_u16(struct vc4_cl *cl, uint16_t n)
+cl_u16(struct vc4_cl_out **cl, uint16_t n)
 {
-        assert((cl->next - cl->base) + 2 <= cl->size);
-
-        put_unaligned_16(cl->next, n);
-        cl->next += 2;
+        put_unaligned_16(*cl, n);
+        cl_advance(cl, 2);
 }
 
 static inline void
-cl_u32(struct vc4_cl *cl, uint32_t n)
+cl_u32(struct vc4_cl_out **cl, uint32_t n)
 {
-        assert((cl->next - cl->base) + 4 <= cl->size);
-
-        put_unaligned_32(cl->next, n);
-        cl->next += 4;
+        put_unaligned_32(*cl, n);
+        cl_advance(cl, 4);
 }
 
 static inline void
-cl_aligned_u32(struct vc4_cl *cl, uint32_t n)
+cl_aligned_u32(struct vc4_cl_out **cl, uint32_t n)
 {
-        assert((cl->next - cl->base) + 4 <= cl->size);
-
-        *(uint32_t *)cl->next = n;
-        cl->next += 4;
+        *(uint32_t *)(*cl) = n;
+        cl_advance(cl, 4);
 }
 
 static inline void
-cl_ptr(struct vc4_cl *cl, void *ptr)
+cl_ptr(struct vc4_cl_out **cl, void *ptr)
 {
-        assert((cl->next - cl->base) + sizeof(void *) <= cl->size);
-
-        *(void **)cl->next = ptr;
-        cl->next += sizeof(void *);
+        *(struct vc4_cl_out **)(*cl) = ptr;
+        cl_advance(cl, sizeof(void *));
 }
 
 static inline void
-cl_f(struct vc4_cl *cl, float f)
+cl_f(struct vc4_cl_out **cl, float f)
 {
         cl_u32(cl, fui(f));
 }
 
 static inline void
-cl_aligned_f(struct vc4_cl *cl, float f)
+cl_aligned_f(struct vc4_cl_out **cl, float f)
 {
         cl_aligned_u32(cl, fui(f));
 }
 
 static inline void
-cl_start_reloc(struct vc4_cl *cl, uint32_t n)
+cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n)
 {
         assert(n == 1 || n == 2);
+#ifdef DEBUG
         assert(cl->reloc_count == 0);
         cl->reloc_count = n;
+#endif
 
-        cl_u8(cl, VC4_PACKET_GEM_HANDLES);
-        cl->reloc_next = cl->next - cl->base;
-        cl_u32(cl, 0); /* Space where hindex will be written. */
-        cl_u32(cl, 0); /* Space where hindex will be written. */
+        cl_u8(out, VC4_PACKET_GEM_HANDLES);
+        cl->reloc_next = *out;
+        cl_u32(out, 0); /* Space where hindex will be written. */
+        cl_u32(out, 0); /* Space where hindex will be written. */
 }
 
-static inline void
+static inline struct vc4_cl_out *
 cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n)
 {
+#ifdef DEBUG
         assert(cl->reloc_count == 0);
         cl->reloc_count = n;
-        cl->reloc_next = cl->next - cl->base;
+#endif
+        cl->reloc_next = cl->next;
 
-        /* Space where hindex will be written. */
-        cl->next += n * 4;
+        /* Reserve the space where hindex will be written. */
+        cl_advance(&cl->next, n * 4);
+
+        return cl->next;
 }
 
 static inline void
-cl_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
-{
-        *(uint32_t *)(cl->base + cl->reloc_next) = hindex;
-        cl->reloc_next += 4;
-
-        cl->reloc_count--;
-
-        cl_u32(cl, offset);
-}
-
-static inline void
-cl_aligned_reloc_hindex(struct vc4_cl *cl, uint32_t hindex, uint32_t offset)
-{
-        *(uint32_t *)(cl->base + cl->reloc_next) = hindex;
-        cl->reloc_next += 4;
-
-        cl->reloc_count--;
-
-        cl_aligned_u32(cl, offset);
-}
-
-static inline void
-cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
+cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl, struct vc4_cl_out **cl_out,
          struct vc4_bo *bo, uint32_t offset)
 {
-        cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset);
+        *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
+        cl_advance(&cl->reloc_next, 4);
+
+#ifdef DEBUG
+        cl->reloc_count--;
+#endif
+
+        cl_u32(cl_out, offset);
 }
 
 static inline void
 cl_aligned_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
-         struct vc4_bo *bo, uint32_t offset)
+                 struct vc4_cl_out **cl_out,
+                 struct vc4_bo *bo, uint32_t offset)
 {
-        cl_aligned_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset);
+        *(uint32_t *)cl->reloc_next = vc4_gem_hindex(vc4, bo);
+        cl_advance(&cl->reloc_next, 4);
+
+#ifdef DEBUG
+        cl->reloc_count--;
+#endif
+
+        cl_aligned_u32(cl_out, offset);
 }
 
 void cl_ensure_space(struct vc4_cl *cl, uint32_t size);
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 69055081daa..6d748010baf 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -34,7 +34,7 @@ dump_float(void *cl, uint32_t offset, uint32_t hw_offset)
         void *f = cl + offset;
 
         fprintf(stderr, "0x%08x 0x%08x:      %f (0x%08x)\n",
-                offset, hw_offset, *(float *)f, *(uint32_t *)f);
+                offset, hw_offset, uif(*(uint32_t *)f), *(uint32_t *)f);
 }
 
 static void
@@ -47,7 +47,33 @@ dump_VC4_PACKET_BRANCH_TO_SUB_LIST(void *cl, uint32_t offset, uint32_t hw_offset
 }
 
 static void
-dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset)
+dump_loadstore_full(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        uint32_t bits = *(uint32_t *)(cl + offset);
+
+        fprintf(stderr, "0x%08x 0x%08x:      addr 0x%08x%s%s%s%s\n",
+                offset, hw_offset,
+                bits & ~0xf,
+                (bits & VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL) ? "" : " clear",
+                (bits & VC4_LOADSTORE_FULL_RES_DISABLE_ZS) ? "" : " zs",
+                (bits & VC4_LOADSTORE_FULL_RES_DISABLE_COLOR) ? "" : " color",
+                (bits & VC4_LOADSTORE_FULL_RES_EOF) ? " eof" : "");
+}
+
+static void
+dump_VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_full(cl, offset, hw_offset);
+}
+
+static void
+dump_VC4_PACKET_STORE_FULL_RES_TILE_BUFFER(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_full(cl, offset, hw_offset);
+}
+
+static void
+dump_loadstore_general(void *cl, uint32_t offset, uint32_t hw_offset)
 {
         uint8_t *bytes = cl + offset;
         uint32_t *addr = cl + offset + 2;
@@ -124,6 +150,18 @@ dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw
                 (*addr & (1 << 3)) ? " EOF" : "");
 }
 
+static void
+dump_VC4_PACKET_STORE_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_general(cl, offset, hw_offset);
+}
+
+static void
+dump_VC4_PACKET_LOAD_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        dump_loadstore_general(cl, offset, hw_offset);
+}
+
 static void
 dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset)
 {
@@ -291,63 +329,63 @@ dump_VC4_PACKET_GEM_HANDLES(void *cl, uint32_t offset, uint32_t hw_offset)
                 offset, hw_offset, handles[0], handles[1]);
 }
 
-#define PACKET_DUMP(name, size) [name] = { #name, size, dump_##name }
-#define PACKET(name, size) [name] = { #name, size, NULL }
+#define PACKET_DUMP(name) [name] = { #name, name ## _SIZE, dump_##name }
+#define PACKET(name) [name] = { #name, name ## _SIZE, NULL }
 
 static const struct packet_info {
         const char *name;
         uint8_t size;
         void (*dump_func)(void *cl, uint32_t offset, uint32_t hw_offset);
 } packet_info[] = {
-        PACKET(VC4_PACKET_HALT, 1),
-        PACKET(VC4_PACKET_NOP, 1),
+        PACKET(VC4_PACKET_HALT),
+        PACKET(VC4_PACKET_NOP),
 
-        PACKET(VC4_PACKET_FLUSH, 1),
-        PACKET(VC4_PACKET_FLUSH_ALL, 1),
-        PACKET(VC4_PACKET_START_TILE_BINNING, 1),
-        PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, 1),
-        PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE, 1),
+        PACKET(VC4_PACKET_FLUSH),
+        PACKET(VC4_PACKET_FLUSH_ALL),
+        PACKET(VC4_PACKET_START_TILE_BINNING),
+        PACKET(VC4_PACKET_INCREMENT_SEMAPHORE),
+        PACKET(VC4_PACKET_WAIT_ON_SEMAPHORE),
 
-        PACKET(VC4_PACKET_BRANCH, 5),
-        PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST, 5),
+        PACKET(VC4_PACKET_BRANCH),
+        PACKET_DUMP(VC4_PACKET_BRANCH_TO_SUB_LIST),
 
-        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER, 1),
-        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF, 1),
-        PACKET(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER, 5),
-        PACKET(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER, 5),
-        PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL, 7),
-        PACKET(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL, 7),
+        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER),
+        PACKET(VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF),
+        PACKET_DUMP(VC4_PACKET_STORE_FULL_RES_TILE_BUFFER),
+        PACKET_DUMP(VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER),
+        PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL),
+        PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
 
-        PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, 14),
-        PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, 10),
+        PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE),
+        PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE),
 
-        PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE, 48),
-        PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE, 49),
+        PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE),
+        PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE),
 
-        PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, 2),
+        PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT),
 
-        PACKET(VC4_PACKET_GL_SHADER_STATE, 5),
-        PACKET(VC4_PACKET_NV_SHADER_STATE, 5),
-        PACKET(VC4_PACKET_VG_SHADER_STATE, 5),
+        PACKET(VC4_PACKET_GL_SHADER_STATE),
+        PACKET(VC4_PACKET_NV_SHADER_STATE),
+        PACKET(VC4_PACKET_VG_SHADER_STATE),
 
-        PACKET(VC4_PACKET_CONFIGURATION_BITS, 4),
-        PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS, 5),
-        PACKET_DUMP(VC4_PACKET_POINT_SIZE, 5),
-        PACKET_DUMP(VC4_PACKET_LINE_WIDTH, 5),
-        PACKET(VC4_PACKET_RHT_X_BOUNDARY, 3),
-        PACKET(VC4_PACKET_DEPTH_OFFSET, 5),
-        PACKET(VC4_PACKET_CLIP_WINDOW, 9),
-        PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET, 5),
-        PACKET(VC4_PACKET_Z_CLIPPING, 9),
-        PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9),
-        PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9),
+        PACKET(VC4_PACKET_CONFIGURATION_BITS),
+        PACKET_DUMP(VC4_PACKET_FLAT_SHADE_FLAGS),
+        PACKET_DUMP(VC4_PACKET_POINT_SIZE),
+        PACKET_DUMP(VC4_PACKET_LINE_WIDTH),
+        PACKET(VC4_PACKET_RHT_X_BOUNDARY),
+        PACKET(VC4_PACKET_DEPTH_OFFSET),
+        PACKET(VC4_PACKET_CLIP_WINDOW),
+        PACKET_DUMP(VC4_PACKET_VIEWPORT_OFFSET),
+        PACKET(VC4_PACKET_Z_CLIPPING),
+        PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING),
+        PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING),
 
-        PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16),
-        PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11),
-        PACKET(VC4_PACKET_CLEAR_COLORS, 14),
-        PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3),
+        PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG),
+        PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG),
+        PACKET(VC4_PACKET_CLEAR_COLORS),
+        PACKET_DUMP(VC4_PACKET_TILE_COORDINATES),
 
-        PACKET_DUMP(VC4_PACKET_GEM_HANDLES, 9),
+        PACKET_DUMP(VC4_PACKET_GEM_HANDLES),
 };
 
 void
@@ -359,7 +397,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render)
         while (offset < size) {
                 uint8_t header = cmds[offset];
 
-                if (header > ARRAY_SIZE(packet_info) ||
+                if (header >= ARRAY_SIZE(packet_info) ||
                     !packet_info[header].name) {
                         fprintf(stderr, "0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n",
                                 offset, hw_offset, header, header);
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 630f8e68896..fff63158c9d 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -61,9 +61,11 @@ vc4_flush(struct pipe_context *pctx)
          * FLUSH completes.
          */
         cl_ensure_space(&vc4->bcl, 8);
-        cl_u8(&vc4->bcl, VC4_PACKET_INCREMENT_SEMAPHORE);
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
+        cl_u8(&bcl, VC4_PACKET_INCREMENT_SEMAPHORE);
         /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */
-        cl_u8(&vc4->bcl, VC4_PACKET_FLUSH);
+        cl_u8(&bcl, VC4_PACKET_FLUSH);
+        cl_end(&vc4->bcl, bcl);
 
         if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
                 pipe_surface_reference(&vc4->color_write, cbuf);
@@ -103,8 +105,10 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
         vc4_flush(pctx);
 
         if (fence) {
+                struct pipe_screen *screen = pctx->screen;
                 struct vc4_fence *f = vc4_fence_create(vc4->screen,
                                                        vc4->last_emit_seqno);
+                screen->fence_reference(screen, fence, NULL);
                 *fence = (struct pipe_fence_handle *)f;
         }
 }
@@ -126,8 +130,7 @@ vc4_cl_references_bo(struct pipe_context *pctx, struct vc4_bo *bo)
          * they match.
          */
         struct vc4_bo **referenced_bos = vc4->bo_pointers.base;
-        for (int i = 0; i < (vc4->bo_handles.next -
-                             vc4->bo_handles.base) / 4; i++) {
+        for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) {
                 if (referenced_bos[i] == bo) {
                         return true;
                 }
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index d5d6be16f6e..654c46f3c0d 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -67,7 +67,20 @@
 #define VC4_DIRTY_CLIP          (1 << 20)
 #define VC4_DIRTY_UNCOMPILED_VS (1 << 21)
 #define VC4_DIRTY_UNCOMPILED_FS (1 << 22)
-#define VC4_DIRTY_COMPILED_FS   (1 << 24)
+#define VC4_DIRTY_COMPILED_CS   (1 << 23)
+#define VC4_DIRTY_COMPILED_VS   (1 << 24)
+#define VC4_DIRTY_COMPILED_FS   (1 << 25)
+
+struct vc4_sampler_view {
+        struct pipe_sampler_view base;
+        uint32_t texture_p0;
+        uint32_t texture_p1;
+};
+
+struct vc4_sampler_state {
+        struct pipe_sampler_state base;
+        uint32_t texture_p1;
+};
 
 struct vc4_texture_stateobj {
         struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS];
@@ -121,6 +134,12 @@ struct vc4_compiled_shader {
         struct vc4_ubo_range *ubo_ranges;
         uint32_t num_ubo_ranges;
         uint32_t ubo_size;
+        /**
+         * VC4_DIRTY_* flags that, when set in vc4->dirty, mean that the
+         * uniforms have to be rewritten (and therefore the shader state
+         * reemitted).
+         */
+        uint32_t uniform_dirty_bits;
 
         /** bitmask of which inputs are color inputs, for flat shade handling. */
         uint32_t color_inputs;
@@ -238,6 +257,11 @@ struct vc4_context {
          */
         bool draw_call_queued;
 
+        /** Maximum valid vertex index for the current shader_rec. */
+        uint32_t max_index;
+        /** Last index bias baked into the current shader_rec. */
+        uint32_t last_index_bias;
+
         struct primconvert_context *primconvert;
 
         struct hash_table *fs_cache, *vs_cache;
@@ -246,6 +270,7 @@ struct vc4_context {
 
         struct ra_regs *regs;
         unsigned int reg_class_any;
+        unsigned int reg_class_r4_or_a;
         unsigned int reg_class_a;
 
         uint8_t prim_mode;
@@ -326,6 +351,18 @@ vc4_context(struct pipe_context *pcontext)
         return (struct vc4_context *)pcontext;
 }
 
+static inline struct vc4_sampler_view *
+vc4_sampler_view(struct pipe_sampler_view *psview)
+{
+        return (struct vc4_sampler_view *)psview;
+}
+
+static inline struct vc4_sampler_state *
+vc4_sampler_state(struct pipe_sampler_state *psampler)
+{
+        return (struct vc4_sampler_state *)psampler;
+}
+
 struct pipe_context *vc4_context_create(struct pipe_screen *pscreen,
                                         void *priv);
 void vc4_draw_init(struct pipe_context *pctx);
@@ -337,6 +374,7 @@ void vc4_simulator_init(struct vc4_screen *screen);
 int vc4_simulator_flush(struct vc4_context *vc4,
                         struct drm_vc4_submit_cl *args);
 
+void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader);
 void vc4_write_uniforms(struct vc4_context *vc4,
                         struct vc4_compiled_shader *shader,
                         struct vc4_constbuf_stateobj *cb,
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 5e6d70d6f33..a4e5e092b1a 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -71,37 +71,40 @@ vc4_start_draw(struct vc4_context *vc4)
         uint32_t height = vc4->framebuffer.height;
         uint32_t tilew = align(width, 64) / 64;
         uint32_t tileh = align(height, 64) / 64;
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
 
         //   Tile state data is 48 bytes per tile, I think it can be thrown away
         //   as soon as binning is finished.
-        cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
-        cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */
-        cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */
-        cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */
-        cl_u8(&vc4->bcl, tilew);
-        cl_u8(&vc4->bcl, tileh);
-        cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */
+        cl_u8(&bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
+        cl_u32(&bcl, 0); /* tile alloc addr, filled by kernel */
+        cl_u32(&bcl, 0); /* tile alloc size, filled by kernel */
+        cl_u32(&bcl, 0); /* tile state addr, filled by kernel */
+        cl_u8(&bcl, tilew);
+        cl_u8(&bcl, tileh);
+        cl_u8(&bcl, 0); /* flags, filled by kernel. */
 
         /* START_TILE_BINNING resets the statechange counters in the hardware,
          * which are what is used when a primitive is binned to a tile to
          * figure out what new state packets need to be written to that tile's
          * command list.
          */
-        cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING);
+        cl_u8(&bcl, VC4_PACKET_START_TILE_BINNING);
 
         /* Reset the current compressed primitives format.  This gets modified
          * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
          * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
          * of every tile.
          */
-        cl_u8(&vc4->bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT);
-        cl_u8(&vc4->bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX |
-                          VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES));
+        cl_u8(&bcl, VC4_PACKET_PRIMITIVE_LIST_FORMAT);
+        cl_u8(&bcl, (VC4_PRIMITIVE_LIST_FORMAT_16_INDEX |
+                     VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES));
 
         vc4->needs_flush = true;
         vc4->draw_call_queued = true;
         vc4->draw_width = width;
         vc4->draw_height = height;
+
+        cl_end(&vc4->bcl, bcl);
 }
 
 static void
@@ -118,6 +121,111 @@ vc4_update_shadow_textures(struct pipe_context *pctx,
         }
 }
 
+static void
+vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info)
+{
+        /* VC4_DIRTY_VTXSTATE */
+        struct vc4_vertex_stateobj *vtx = vc4->vtx;
+        /* VC4_DIRTY_VTXBUF */
+        struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;
+
+        /* The simulator throws a fit if VS or CS don't read an attribute, so
+         * we emit a dummy read.
+         */
+        uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);
+        /* Emit the shader record. */
+        struct vc4_cl_out *shader_rec =
+                cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit);
+        /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
+        cl_u16(&shader_rec,
+               VC4_SHADER_FLAG_ENABLE_CLIPPING |
+               VC4_SHADER_FLAG_FS_SINGLE_THREAD |
+               ((info->mode == PIPE_PRIM_POINTS &&
+                 vc4->rasterizer->base.point_size_per_vertex) ?
+                VC4_SHADER_FLAG_VS_POINT_SIZE : 0));
+
+        /* VC4_DIRTY_COMPILED_FS */
+        cl_u8(&shader_rec, 0); /* fs num uniforms (unused) */
+        cl_u8(&shader_rec, vc4->prog.fs->num_inputs);
+        cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.fs->bo, 0);
+        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
+
+        /* VC4_DIRTY_COMPILED_VS */
+        cl_u16(&shader_rec, 0); /* vs num uniforms */
+        cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
+        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
+        cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
+        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
+
+        /* VC4_DIRTY_COMPILED_CS */
+        cl_u16(&shader_rec, 0); /* cs num uniforms */
+        cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
+        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
+        cl_reloc(vc4, &vc4->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
+        cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
+
+        uint32_t max_index = 0xffff;
+        for (int i = 0; i < vtx->num_elements; i++) {
+                struct pipe_vertex_element *elem = &vtx->pipe[i];
+                struct pipe_vertex_buffer *vb =
+                        &vertexbuf->vb[elem->vertex_buffer_index];
+                struct vc4_resource *rsc = vc4_resource(vb->buffer);
+                /* not vc4->dirty tracked: vc4->last_index_bias */
+                uint32_t offset = (vb->buffer_offset +
+                                   elem->src_offset +
+                                   vb->stride * info->index_bias);
+                uint32_t vb_size = rsc->bo->size - offset;
+                uint32_t elem_size =
+                        util_format_get_blocksize(elem->src_format);
+
+                cl_reloc(vc4, &vc4->shader_rec, &shader_rec, rsc->bo, offset);
+                cl_u8(&shader_rec, elem_size - 1);
+                cl_u8(&shader_rec, vb->stride);
+                cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[i]);
+                cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[i]);
+
+                if (vb->stride > 0) {
+                        max_index = MIN2(max_index,
+                                         (vb_size - elem_size) / vb->stride);
+                }
+        }
+
+        if (vtx->num_elements == 0) {
+                assert(num_elements_emit == 1);
+                struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");
+                cl_reloc(vc4, &vc4->shader_rec, &shader_rec, bo, 0);
+                cl_u8(&shader_rec, 16 - 1); /* element size */
+                cl_u8(&shader_rec, 0); /* stride */
+                cl_u8(&shader_rec, 0); /* VS VPM offset */
+                cl_u8(&shader_rec, 0); /* CS VPM offset */
+                vc4_bo_unreference(&bo);
+        }
+        cl_end(&vc4->shader_rec, shader_rec);
+
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
+        /* Emit the shader state packet referencing the record above. */
+        cl_u8(&bcl, VC4_PACKET_GL_SHADER_STATE);
+        assert(vtx->num_elements <= 8);
+        /* Note that number of attributes == 0 in the packet means 8
+         * attributes.  This field also contains the offset into shader_rec.
+         */
+        cl_u32(&bcl, num_elements_emit & 0x7);
+        cl_end(&vc4->bcl, bcl);
+
+        vc4_write_uniforms(vc4, vc4->prog.fs,
+                           &vc4->constbuf[PIPE_SHADER_FRAGMENT],
+                           &vc4->fragtex);
+        vc4_write_uniforms(vc4, vc4->prog.vs,
+                           &vc4->constbuf[PIPE_SHADER_VERTEX],
+                           &vc4->verttex);
+        vc4_write_uniforms(vc4, vc4->prog.cs,
+                           &vc4->constbuf[PIPE_SHADER_VERTEX],
+                           &vc4->verttex);
+
+        vc4->last_index_bias = info->index_bias;
+        vc4->max_index = max_index;
+}
+
 static void
 vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 {
@@ -138,9 +246,6 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
         vc4_get_draw_cl_space(vc4);
 
-        struct vc4_vertex_stateobj *vtx = vc4->vtx;
-        struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;
-
         if (vc4->prim_mode != info->mode) {
                 vc4->prim_mode = info->mode;
                 vc4->dirty |= VC4_DIRTY_PRIM_MODE;
@@ -150,94 +255,27 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
         vc4_update_compiled_shaders(vc4, info->mode);
 
         vc4_emit_state(pctx);
+
+        if ((vc4->dirty & (VC4_DIRTY_VTXBUF |
+                           VC4_DIRTY_VTXSTATE |
+                           VC4_DIRTY_PRIM_MODE |
+                           VC4_DIRTY_RASTERIZER |
+                           VC4_DIRTY_COMPILED_CS |
+                           VC4_DIRTY_COMPILED_VS |
+                           VC4_DIRTY_COMPILED_FS |
+                           vc4->prog.cs->uniform_dirty_bits |
+                           vc4->prog.vs->uniform_dirty_bits |
+                           vc4->prog.fs->uniform_dirty_bits)) ||
+            vc4->last_index_bias != info->index_bias) {
+                vc4_emit_gl_shader_state(vc4, info);
+        }
+
         vc4->dirty = 0;
 
-        vc4_write_uniforms(vc4, vc4->prog.fs,
-                           &vc4->constbuf[PIPE_SHADER_FRAGMENT],
-                           &vc4->fragtex);
-        vc4_write_uniforms(vc4, vc4->prog.vs,
-                           &vc4->constbuf[PIPE_SHADER_VERTEX],
-                           &vc4->verttex);
-        vc4_write_uniforms(vc4, vc4->prog.cs,
-                           &vc4->constbuf[PIPE_SHADER_VERTEX],
-                           &vc4->verttex);
-
-        /* The simulator throws a fit if VS or CS don't read an attribute, so
-         * we emit a dummy read.
-         */
-        uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);
-        /* Emit the shader record. */
-        cl_start_shader_reloc(&vc4->shader_rec, 3 + num_elements_emit);
-        cl_u16(&vc4->shader_rec,
-               VC4_SHADER_FLAG_ENABLE_CLIPPING |
-               ((info->mode == PIPE_PRIM_POINTS &&
-                 vc4->rasterizer->base.point_size_per_vertex) ?
-                VC4_SHADER_FLAG_VS_POINT_SIZE : 0));
-        cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */
-        cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs);
-        cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0);
-        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
-
-        cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */
-        cl_u8(&vc4->shader_rec, vc4->prog.vs->vattrs_live);
-        cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[8]);
-        cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0);
-        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
-
-        cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */
-        cl_u8(&vc4->shader_rec, vc4->prog.cs->vattrs_live);
-        cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[8]);
-        cl_reloc(vc4, &vc4->shader_rec, vc4->prog.cs->bo, 0);
-        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
-
-        uint32_t max_index = 0xffff;
-        uint32_t vpm_offset = 0;
-        for (int i = 0; i < vtx->num_elements; i++) {
-                struct pipe_vertex_element *elem = &vtx->pipe[i];
-                struct pipe_vertex_buffer *vb =
-                        &vertexbuf->vb[elem->vertex_buffer_index];
-                struct vc4_resource *rsc = vc4_resource(vb->buffer);
-                uint32_t offset = vb->buffer_offset + elem->src_offset;
-                uint32_t vb_size = rsc->bo->size - offset;
-                uint32_t elem_size =
-                        util_format_get_blocksize(elem->src_format);
-
-                cl_reloc(vc4, &vc4->shader_rec, rsc->bo, offset);
-                cl_u8(&vc4->shader_rec, elem_size - 1);
-                cl_u8(&vc4->shader_rec, vb->stride);
-                cl_u8(&vc4->shader_rec, vc4->prog.vs->vattr_offsets[i]);
-                cl_u8(&vc4->shader_rec, vc4->prog.cs->vattr_offsets[i]);
-
-                vpm_offset += align(elem_size, 4);
-
-                if (vb->stride > 0) {
-                        max_index = MIN2(max_index,
-                                         (vb_size - elem_size) / vb->stride);
-                }
-        }
-
-        if (vtx->num_elements == 0) {
-                assert(num_elements_emit == 1);
-                struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");
-                cl_reloc(vc4, &vc4->shader_rec, bo, 0);
-                cl_u8(&vc4->shader_rec, 16 - 1); /* element size */
-                cl_u8(&vc4->shader_rec, 0); /* stride */
-                cl_u8(&vc4->shader_rec, 0); /* VS VPM offset */
-                cl_u8(&vc4->shader_rec, 0); /* CS VPM offset */
-                vc4_bo_unreference(&bo);
-        }
-
-        /* the actual draw call. */
-        cl_u8(&vc4->bcl, VC4_PACKET_GL_SHADER_STATE);
-        assert(vtx->num_elements <= 8);
-        /* Note that number of attributes == 0 in the packet means 8
-         * attributes.  This field also contains the offset into shader_rec.
-         */
-        cl_u32(&vc4->bcl, num_elements_emit & 0x7);
-
         /* Note that the primitive type fields match with OpenGL/gallium
          * definitions, up to but not including QUADS.
          */
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
         if (info->indexed) {
                 uint32_t offset = vc4->indexbuf.offset;
                 uint32_t index_size = vc4->indexbuf.index_size;
@@ -251,25 +289,26 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 }
                 struct vc4_resource *rsc = vc4_resource(prsc);
 
-                cl_start_reloc(&vc4->bcl, 1);
-                cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
-                cl_u8(&vc4->bcl,
+                cl_start_reloc(&vc4->bcl, &bcl, 1);
+                cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
+                cl_u8(&bcl,
                       info->mode |
                       (index_size == 2 ?
                        VC4_INDEX_BUFFER_U16:
                        VC4_INDEX_BUFFER_U8));
-                cl_u32(&vc4->bcl, info->count);
-                cl_reloc(vc4, &vc4->bcl, rsc->bo, offset);
-                cl_u32(&vc4->bcl, max_index);
+                cl_u32(&bcl, info->count);
+                cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset);
+                cl_u32(&bcl, vc4->max_index);
 
                 if (vc4->indexbuf.index_size == 4)
                         pipe_resource_reference(&prsc, NULL);
         } else {
-                cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
-                cl_u8(&vc4->bcl, info->mode);
-                cl_u32(&vc4->bcl, info->count);
-                cl_u32(&vc4->bcl, info->start);
+                cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
+                cl_u8(&bcl, info->mode);
+                cl_u32(&bcl, info->count);
+                cl_u32(&bcl, info->start);
         }
+        cl_end(&vc4->bcl, bcl);
 
         if (vc4->zsa && vc4->zsa->base.depth.enabled) {
                 vc4->resolve |= PIPE_CLEAR_DEPTH;
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index 5f1ee4fa125..863ef8da8fb 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -31,12 +31,14 @@
 #define DRM_VC4_WAIT_BO                           0x02
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
+#define DRM_VC4_CREATE_SHADER_BO                  0x05
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
 #define DRM_IOCTL_VC4_WAIT_BO             DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo)
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
+#define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
 
 struct drm_vc4_submit_rcl_surface {
 	uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -182,6 +184,29 @@ struct drm_vc4_create_bo {
 	uint32_t pad;
 };
 
+/**
+ * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4
+ * shader BOs.
+ *
+ * Since allowing a shader to be overwritten while it's also being
+ * executed from would allow privilege escalation, shaders must be
+ * created using this ioctl, and they can't be mmapped later.
+ */
+struct drm_vc4_create_shader_bo {
+	/* Size of the data argument. */
+	uint32_t size;
+	/* Flags, currently must be 0. */
+	uint32_t flags;
+
+	/* Pointer to the data. */
+	uint64_t data;
+
+	/** Returned GEM handle for the BO. */
+	uint32_t handle;
+	/* Pad, must be 0. */
+	uint32_t pad;
+};
+
 /**
  * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs.
  *
diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c
index d2b54fccf91..ba064ff889b 100644
--- a/src/gallium/drivers/vc4/vc4_emit.c
+++ b/src/gallium/drivers/vc4/vc4_emit.c
@@ -28,23 +28,24 @@ vc4_emit_state(struct pipe_context *pctx)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
 
+        struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
         if (vc4->dirty & (VC4_DIRTY_SCISSOR | VC4_DIRTY_VIEWPORT)) {
                 float *vpscale = vc4->viewport.scale;
                 float *vptranslate = vc4->viewport.translate;
-                float vp_minx = -fabs(vpscale[0]) + vptranslate[0];
-                float vp_maxx = fabs(vpscale[0]) + vptranslate[0];
-                float vp_miny = -fabs(vpscale[1]) + vptranslate[1];
-                float vp_maxy = fabs(vpscale[1]) + vptranslate[1];
+                float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
+                float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
+                float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
+                float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
                 uint32_t minx = MAX2(vc4->scissor.minx, vp_minx);
                 uint32_t miny = MAX2(vc4->scissor.miny, vp_miny);
                 uint32_t maxx = MIN2(vc4->scissor.maxx, vp_maxx);
                 uint32_t maxy = MIN2(vc4->scissor.maxy, vp_maxy);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_CLIP_WINDOW);
-                cl_u16(&vc4->bcl, minx);
-                cl_u16(&vc4->bcl, miny);
-                cl_u16(&vc4->bcl, maxx - minx);
-                cl_u16(&vc4->bcl, maxy - miny);
+                cl_u8(&bcl, VC4_PACKET_CLIP_WINDOW);
+                cl_u16(&bcl, minx);
+                cl_u16(&bcl, miny);
+                cl_u16(&bcl, maxx - minx);
+                cl_u16(&bcl, maxy - miny);
 
                 vc4->draw_min_x = MIN2(vc4->draw_min_x, minx);
                 vc4->draw_min_y = MIN2(vc4->draw_min_y, miny);
@@ -53,47 +54,49 @@ vc4_emit_state(struct pipe_context *pctx)
         }
 
         if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) {
-                cl_u8(&vc4->bcl, VC4_PACKET_CONFIGURATION_BITS);
-                cl_u8(&vc4->bcl,
+                cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS);
+                cl_u8(&bcl,
                       vc4->rasterizer->config_bits[0] |
                       vc4->zsa->config_bits[0]);
-                cl_u8(&vc4->bcl,
+                cl_u8(&bcl,
                       vc4->rasterizer->config_bits[1] |
                       vc4->zsa->config_bits[1]);
-                cl_u8(&vc4->bcl,
+                cl_u8(&bcl,
                       vc4->rasterizer->config_bits[2] |
                       vc4->zsa->config_bits[2]);
         }
 
         if (vc4->dirty & VC4_DIRTY_RASTERIZER) {
-                cl_u8(&vc4->bcl, VC4_PACKET_DEPTH_OFFSET);
-                cl_u16(&vc4->bcl, vc4->rasterizer->offset_factor);
-                cl_u16(&vc4->bcl, vc4->rasterizer->offset_units);
+                cl_u8(&bcl, VC4_PACKET_DEPTH_OFFSET);
+                cl_u16(&bcl, vc4->rasterizer->offset_factor);
+                cl_u16(&bcl, vc4->rasterizer->offset_units);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_POINT_SIZE);
-                cl_f(&vc4->bcl, vc4->rasterizer->point_size);
+                cl_u8(&bcl, VC4_PACKET_POINT_SIZE);
+                cl_f(&bcl, vc4->rasterizer->point_size);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_LINE_WIDTH);
-                cl_f(&vc4->bcl, vc4->rasterizer->base.line_width);
+                cl_u8(&bcl, VC4_PACKET_LINE_WIDTH);
+                cl_f(&bcl, vc4->rasterizer->base.line_width);
         }
 
         if (vc4->dirty & VC4_DIRTY_VIEWPORT) {
-                cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_XY_SCALING);
-                cl_f(&vc4->bcl, vc4->viewport.scale[0] * 16.0f);
-                cl_f(&vc4->bcl, vc4->viewport.scale[1] * 16.0f);
+                cl_u8(&bcl, VC4_PACKET_CLIPPER_XY_SCALING);
+                cl_f(&bcl, vc4->viewport.scale[0] * 16.0f);
+                cl_f(&bcl, vc4->viewport.scale[1] * 16.0f);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_CLIPPER_Z_SCALING);
-                cl_f(&vc4->bcl, vc4->viewport.translate[2]);
-                cl_f(&vc4->bcl, vc4->viewport.scale[2]);
+                cl_u8(&bcl, VC4_PACKET_CLIPPER_Z_SCALING);
+                cl_f(&bcl, vc4->viewport.translate[2]);
+                cl_f(&bcl, vc4->viewport.scale[2]);
 
-                cl_u8(&vc4->bcl, VC4_PACKET_VIEWPORT_OFFSET);
-                cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[0]);
-                cl_u16(&vc4->bcl, 16 * vc4->viewport.translate[1]);
+                cl_u8(&bcl, VC4_PACKET_VIEWPORT_OFFSET);
+                cl_u16(&bcl, 16 * vc4->viewport.translate[0]);
+                cl_u16(&bcl, 16 * vc4->viewport.translate[1]);
         }
 
         if (vc4->dirty & VC4_DIRTY_FLAT_SHADE_FLAGS) {
-                cl_u8(&vc4->bcl, VC4_PACKET_FLAT_SHADE_FLAGS);
-                cl_u32(&vc4->bcl, vc4->rasterizer->base.flatshade ?
+                cl_u8(&bcl, VC4_PACKET_FLAT_SHADE_FLAGS);
+                cl_u32(&bcl, vc4->rasterizer->base.flatshade ?
                        vc4->prog.fs->color_inputs : 0);
         }
+
+        cl_end(&vc4->bcl, bcl);
 }
diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c
index f2ee91de61a..b6fb2a8a460 100644
--- a/src/gallium/drivers/vc4/vc4_fence.c
+++ b/src/gallium/drivers/vc4/vc4_fence.c
@@ -59,16 +59,6 @@ vc4_fence_reference(struct pipe_screen *pscreen,
         *p = f;
 }
 
-static boolean
-vc4_fence_signalled(struct pipe_screen *pscreen,
-                    struct pipe_fence_handle *pf)
-{
-        struct vc4_screen *screen = vc4_screen(pscreen);
-        struct vc4_fence *f = (struct vc4_fence *)pf;
-
-        return vc4_wait_seqno(screen, f->seqno, 0);
-}
-
 static boolean
 vc4_fence_finish(struct pipe_screen *pscreen,
                  struct pipe_fence_handle *pf,
@@ -77,7 +67,7 @@ vc4_fence_finish(struct pipe_screen *pscreen,
         struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_fence *f = (struct vc4_fence *)pf;
 
-        return vc4_wait_seqno(screen, f->seqno, timeout_ns);
+        return vc4_wait_seqno(screen, f->seqno, timeout_ns, "fence wait");
 }
 
 struct vc4_fence *
@@ -98,6 +88,5 @@ void
 vc4_fence_init(struct vc4_screen *screen)
 {
         screen->base.fence_reference = vc4_fence_reference;
-        screen->base.fence_signalled = vc4_fence_signalled;
         screen->base.fence_finish = vc4_fence_finish;
 }
diff --git a/src/gallium/drivers/vc4/vc4_formats.c b/src/gallium/drivers/vc4/vc4_formats.c
index 004bac70c67..ffce61237de 100644
--- a/src/gallium/drivers/vc4/vc4_formats.c
+++ b/src/gallium/drivers/vc4/vc4_formats.c
@@ -108,7 +108,7 @@ static const struct vc4_format vc4_format_table[] = {
 static const struct vc4_format *
 get_format(enum pipe_format f)
 {
-        if (f > ARRAY_SIZE(vc4_format_table) ||
+        if (f >= ARRAY_SIZE(vc4_format_table) ||
             !vc4_format_table[f].present)
                 return NULL;
         else
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index dcade15443a..7ebd9f160eb 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -44,8 +44,7 @@ void
 vc4_job_reset(struct vc4_context *vc4)
 {
         struct vc4_bo **referenced_bos = vc4->bo_pointers.base;
-        for (int i = 0; i < (vc4->bo_handles.next -
-                             vc4->bo_handles.base) / 4; i++) {
+        for (int i = 0; i < cl_offset(&vc4->bo_handles) / 4; i++) {
                 vc4_bo_unreference(&referenced_bos[i]);
         }
         vc4_reset_cl(&vc4->bcl);
@@ -145,7 +144,7 @@ vc4_job_submit(struct vc4_context *vc4)
 {
         if (vc4_debug & VC4_DEBUG_CL) {
                 fprintf(stderr, "BCL:\n");
-                vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false);
+                vc4_dump_cl(vc4->bcl.base, cl_offset(&vc4->bcl), false);
         }
 
         struct drm_vc4_submit_cl submit;
@@ -164,15 +163,14 @@ vc4_job_submit(struct vc4_context *vc4)
                                      vc4->zs_write, true, true);
 
         submit.bo_handles = (uintptr_t)vc4->bo_handles.base;
-        submit.bo_handle_count = (vc4->bo_handles.next -
-                                  vc4->bo_handles.base) / 4;
+        submit.bo_handle_count = cl_offset(&vc4->bo_handles) / 4;
         submit.bin_cl = (uintptr_t)vc4->bcl.base;
-        submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base;
+        submit.bin_cl_size = cl_offset(&vc4->bcl);
         submit.shader_rec = (uintptr_t)vc4->shader_rec.base;
-        submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base;
+        submit.shader_rec_size = cl_offset(&vc4->shader_rec);
         submit.shader_rec_count = vc4->shader_rec_count;
         submit.uniforms = (uintptr_t)vc4->uniforms.base;
-        submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base;
+        submit.uniforms_size = cl_offset(&vc4->uniforms);
 
         assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0);
         submit.min_x_tile = vc4->draw_min_x / 64;
@@ -207,7 +205,7 @@ vc4_job_submit(struct vc4_context *vc4)
 
         if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) {
                 if (!vc4_wait_seqno(vc4->screen, vc4->last_emit_seqno,
-                                    PIPE_TIMEOUT_INFINITE)) {
+                                    PIPE_TIMEOUT_INFINITE, "sync")) {
                         fprintf(stderr, "Wait failed.\n");
                         abort();
                 }
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
new file mode 100644
index 00000000000..a372a6c0cdc
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Implements most of the fixed function fragment pipeline in shader code.
+ *
+ * VC4 doesn't have any hardware support for blending, alpha test, logic ops,
+ * or color mask.  Instead, you read the current contents of the destination
+ * from the tile buffer after having waited for the scoreboard (which is
+ * handled by vc4_qpu_emit.c), then do math using your output color and that
+ * destination value, and update the output color appropriately.
+ */
+
+/**
+ * Lowers fixed-function blending to a load of the destination color and a
+ * series of ALU operations before the store of the output.
+ */
+#include "util/u_format.h"
+#include "vc4_qir.h"
+#include "glsl/nir/nir_builder.h"
+#include "vc4_context.h"
+
+/** Emits a load of the previous fragment color from the tile buffer. */
+static nir_ssa_def *
+vc4_nir_get_dst_color(nir_builder *b)
+{
+        nir_intrinsic_instr *load =
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_load_input);
+        load->num_components = 1;
+        load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT;
+        nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+        nir_builder_instr_insert(b, &load->instr);
+        return &load->dest.ssa;
+}
+
+static  nir_ssa_def *
+vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb)
+{
+        nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045));
+        nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92));
+        nir_ssa_def *high = nir_fpow(b,
+                                     nir_fmul(b,
+                                              nir_fadd(b, srgb,
+                                                       nir_imm_float(b, 0.055)),
+                                              nir_imm_float(b, 1.0 / 1.055)),
+                                     nir_imm_float(b, 2.4));
+
+        return nir_bcsel(b, is_low, low, high);
+}
+
+static  nir_ssa_def *
+vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear)
+{
+        nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308));
+        nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92));
+        nir_ssa_def *high = nir_fsub(b,
+                                     nir_fmul(b,
+                                              nir_imm_float(b, 1.055),
+                                              nir_fpow(b,
+                                                       linear,
+                                                       nir_imm_float(b, 0.41666))),
+                                     nir_imm_float(b, 0.055));
+
+        return nir_bcsel(b, is_low, low, high);
+}
+
+static nir_ssa_def *
+vc4_blend_channel(nir_builder *b,
+                  nir_ssa_def **src,
+                  nir_ssa_def **dst,
+                  unsigned factor,
+                  int channel)
+{
+        switch(factor) {
+        case PIPE_BLENDFACTOR_ONE:
+                return nir_imm_float(b, 1.0);
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+                return src[channel];
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+                return src[3];
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+                return dst[3];
+        case PIPE_BLENDFACTOR_DST_COLOR:
+                return dst[channel];
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+                if (channel != 3) {
+                        return nir_fmin(b,
+                                        src[3],
+                                        nir_fsub(b,
+                                                 nir_imm_float(b, 1.0),
+                                                 dst[3]));
+                } else {
+                        return nir_imm_float(b, 1.0);
+                }
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel);
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W);
+        case PIPE_BLENDFACTOR_ZERO:
+                return nir_imm_float(b, 0.0);
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0), src[channel]);
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0), src[3]);
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0), dst[3]);
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0), dst[channel]);
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+                return nir_fsub(b, nir_imm_float(b, 1.0),
+                                vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_X + channel));
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+                return nir_fsub(b, nir_imm_float(b, 1.0),
+                                vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_W));
+
+        default:
+        case PIPE_BLENDFACTOR_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_SRC1_ALPHA:
+        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend factor %d\n", factor);
+                return nir_imm_float(b, 1.0);
+        }
+}
+
+static nir_ssa_def *
+vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
+               unsigned func)
+{
+        switch (func) {
+        case PIPE_BLEND_ADD:
+                return nir_fadd(b, src, dst);
+        case PIPE_BLEND_SUBTRACT:
+                return nir_fsub(b, src, dst);
+        case PIPE_BLEND_REVERSE_SUBTRACT:
+                return nir_fsub(b, dst, src);
+        case PIPE_BLEND_MIN:
+                return nir_fmin(b, src, dst);
+        case PIPE_BLEND_MAX:
+                return nir_fmax(b, src, dst);
+
+        default:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend func %d\n", func);
+                return src;
+
+        }
+}
+
+static void
+vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
+                nir_ssa_def **src_color, nir_ssa_def **dst_color)
+{
+        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
+
+        if (!blend->blend_enable) {
+                for (int i = 0; i < 4; i++)
+                        result[i] = src_color[i];
+                return;
+        }
+
+        /* Clamp the src color to [0, 1].  Dest is already clamped. */
+        for (int i = 0; i < 4; i++)
+                src_color[i] = nir_fsat(b, src_color[i]);
+
+        nir_ssa_def *src_blend[4], *dst_blend[4];
+        for (int i = 0; i < 4; i++) {
+                int src_factor = ((i != 3) ? blend->rgb_src_factor :
+                                  blend->alpha_src_factor);
+                int dst_factor = ((i != 3) ? blend->rgb_dst_factor :
+                                  blend->alpha_dst_factor);
+                src_blend[i] = nir_fmul(b, src_color[i],
+                                        vc4_blend_channel(b,
+                                                          src_color, dst_color,
+                                                          src_factor, i));
+                dst_blend[i] = nir_fmul(b, dst_color[i],
+                                        vc4_blend_channel(b,
+                                                          src_color, dst_color,
+                                                          dst_factor, i));
+        }
+
+        for (int i = 0; i < 4; i++) {
+                result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i],
+                                           ((i != 3) ? blend->rgb_func :
+                                            blend->alpha_func));
+        }
+}
+
+static nir_ssa_def *
+vc4_logicop(nir_builder *b, int logicop_func,
+            nir_ssa_def *src, nir_ssa_def *dst)
+{
+        switch (logicop_func) {
+        case PIPE_LOGICOP_CLEAR:
+                return nir_imm_int(b, 0);
+        case PIPE_LOGICOP_NOR:
+                return nir_inot(b, nir_ior(b, src, dst));
+        case PIPE_LOGICOP_AND_INVERTED:
+                return nir_iand(b, nir_inot(b, src), dst);
+        case PIPE_LOGICOP_COPY_INVERTED:
+                return nir_inot(b, src);
+        case PIPE_LOGICOP_AND_REVERSE:
+                return nir_iand(b, src, nir_inot(b, dst));
+        case PIPE_LOGICOP_INVERT:
+                return nir_inot(b, dst);
+        case PIPE_LOGICOP_XOR:
+                return nir_ixor(b, src, dst);
+        case PIPE_LOGICOP_NAND:
+                return nir_inot(b, nir_iand(b, src, dst));
+        case PIPE_LOGICOP_AND:
+                return nir_iand(b, src, dst);
+        case PIPE_LOGICOP_EQUIV:
+                return nir_inot(b, nir_ixor(b, src, dst));
+        case PIPE_LOGICOP_NOOP:
+                return dst;
+        case PIPE_LOGICOP_OR_INVERTED:
+                return nir_ior(b, nir_inot(b, src), dst);
+        case PIPE_LOGICOP_OR_REVERSE:
+                return nir_ior(b, src, nir_inot(b, dst));
+        case PIPE_LOGICOP_OR:
+                return nir_ior(b, src, dst);
+        case PIPE_LOGICOP_SET:
+                return nir_imm_int(b, ~0);
+        default:
+                fprintf(stderr, "Unknown logic op %d\n", logicop_func);
+                /* FALLTHROUGH */
+        case PIPE_LOGICOP_COPY:
+                return src;
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_pipe_compare_func(nir_builder *b, int func,
+                          nir_ssa_def *src0, nir_ssa_def *src1)
+{
+        switch (func) {
+        default:
+                fprintf(stderr, "Unknown compare func %d\n", func);
+                /* FALLTHROUGH */
+        case PIPE_FUNC_NEVER:
+                return nir_imm_int(b, 0);
+        case PIPE_FUNC_ALWAYS:
+                return nir_imm_int(b, ~0);
+        case PIPE_FUNC_EQUAL:
+                return nir_feq(b, src0, src1);
+        case PIPE_FUNC_NOTEQUAL:
+                return nir_fne(b, src0, src1);
+        case PIPE_FUNC_GREATER:
+                return nir_flt(b, src1, src0);
+        case PIPE_FUNC_GEQUAL:
+                return nir_fge(b, src0, src1);
+        case PIPE_FUNC_LESS:
+                return nir_flt(b, src0, src1);
+        case PIPE_FUNC_LEQUAL:
+                return nir_fge(b, src1, src0);
+        }
+}
+
+static void
+vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b,
+                                nir_ssa_def *alpha)
+{
+        if (!c->fs_key->alpha_test)
+                return;
+
+        nir_ssa_def *alpha_ref =
+                vc4_nir_get_state_uniform(b, QUNIFORM_ALPHA_REF);
+        nir_ssa_def *condition =
+                vc4_nir_pipe_compare_func(b, c->fs_key->alpha_test_func,
+                                          alpha, alpha_ref);
+
+        nir_intrinsic_instr *discard =
+                nir_intrinsic_instr_create(b->shader,
+                                           nir_intrinsic_discard_if);
+        discard->num_components = 1;
+        discard->src[0] = nir_src_for_ssa(nir_inot(b, condition));
+        nir_builder_instr_insert(b, &discard->instr);
+}
+
+static void
+vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
+                          nir_intrinsic_instr *intr)
+{
+        enum pipe_format color_format = c->fs_key->color_format;
+        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+
+        /* Pull out the float src/dst color components. */
+        nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b);
+        nir_ssa_def *dst_vec4 = nir_unpack_unorm_4x8(b, packed_dst_color);
+        nir_ssa_def *src_color[4], *unpacked_dst_color[4];
+        for (unsigned i = 0; i < 4; i++) {
+                src_color[i] = nir_swizzle(b, intr->src[0].ssa, &i, 1, false);
+                unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false);
+        }
+
+        /* Unswizzle the destination color. */
+        nir_ssa_def *dst_color[4];
+        for (unsigned i = 0; i < 4; i++) {
+                dst_color[i] = vc4_nir_get_swizzled_channel(b,
+                                                            unpacked_dst_color,
+                                                            format_swiz[i]);
+        }
+
+        vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);
+
+        /* Turn dst color to linear. */
+        if (util_format_is_srgb(color_format)) {
+                for (int i = 0; i < 3; i++)
+                        dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]);
+        }
+
+        nir_ssa_def *blend_color[4];
+        vc4_do_blending(c, b, blend_color, src_color, dst_color);
+
+        /* sRGB encode the output color */
+        if (util_format_is_srgb(color_format)) {
+                for (int i = 0; i < 3; i++)
+                        blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]);
+        }
+
+        nir_ssa_def *swizzled_outputs[4];
+        for (int i = 0; i < 4; i++) {
+                swizzled_outputs[i] =
+                        vc4_nir_get_swizzled_channel(b, blend_color,
+                                                     format_swiz[i]);
+        }
+
+        nir_ssa_def *packed_color =
+                nir_pack_unorm_4x8(b,
+                                   nir_vec4(b,
+                                            swizzled_outputs[0],
+                                            swizzled_outputs[1],
+                                            swizzled_outputs[2],
+                                            swizzled_outputs[3]));
+
+        packed_color = vc4_logicop(b, c->fs_key->logicop_func,
+                                   packed_color, packed_dst_color);
+
+        /* If the bit isn't set in the color mask, then just return the
+         * original dst color, instead.  (0xffu: 0xff << 24 overflows int.)
+         */
+        uint32_t colormask = 0xffffffff;
+        for (int i = 0; i < 4; i++) {
+                if (format_swiz[i] < 4 &&
+                    !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) {
+                        colormask &= ~(0xffu << (i * 8));
+                }
+        }
+        packed_color = nir_ior(b,
+                               nir_iand(b, packed_color,
+                                        nir_imm_int(b, colormask)),
+                               nir_iand(b, packed_dst_color,
+                                        nir_imm_int(b, ~colormask)));
+
+        /* Turn the old vec4 output into a store of the packed color. */
+        nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+                              nir_src_for_ssa(packed_color));
+        intr->num_components = 1;
+}
+
+static bool
+vc4_nir_lower_blend_block(nir_block *block, void *state)
+{
+        struct vc4_compile *c = state;
+
+        nir_foreach_instr(block, instr) {
+                if (instr->type != nir_instr_type_intrinsic)
+                        continue;
+                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                if (intr->intrinsic != nir_intrinsic_store_output)
+                        continue;
+
+                nir_variable *output_var = NULL;
+                foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+                        if (var->data.driver_location == intr->const_index[0]) {
+                                output_var = var;
+                                break;
+                        }
+                }
+                assert(output_var);
+                unsigned semantic_name = output_var->data.location;
+
+                if (semantic_name != TGSI_SEMANTIC_COLOR)
+                        continue;
+
+                nir_function_impl *impl =
+                        nir_cf_node_get_function(&block->cf_node);
+                nir_builder b;
+                nir_builder_init(&b, impl);
+                nir_builder_insert_before_instr(&b, &intr->instr);
+                vc4_nir_lower_blend_instr(c, &b, intr);
+        }
+        return true;
+}
+
+void
+vc4_nir_lower_blend(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload) {
+                if (overload->impl) {
+                        nir_foreach_block(overload->impl,
+                                          vc4_nir_lower_blend_block, c);
+
+                        nir_metadata_preserve(overload->impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                }
+        }
+}
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
new file mode 100644
index 00000000000..229d41147d8
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "vc4_qir.h"
+#include "tgsi/tgsi_info.h"
+#include "glsl/nir/nir_builder.h"
+
+/**
+ * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
+ * something amenable to the VC4 architecture.
+ *
+ * Currently, it splits inputs, outputs, and uniforms into scalars, drops any
+ * non-position outputs in coordinate shaders, and fixes up the addressing on
+ * indirect uniform loads.
+ */
+
+/**
+ * Rebuilds a vec4 from the four scalar defs in comps[], redirects every use
+ * of intr's SSA destination to it, and removes intr.
+ */
+static void
+replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
+                            nir_ssa_def **comps)
+{
+        void *mem_ctx = ralloc_parent(b->impl);
+
+        /* The later ALU scalarization pass splits this vec4 apart again. */
+        nir_ssa_def *quad = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]);
+        nir_src quad_src = nir_src_for_ssa(quad);
+
+        nir_ssa_def_rewrite_uses(&intr->dest.ssa, quad_src, mem_ctx);
+        nir_instr_remove(&intr->instr);
+}
+
+/**
+ * Splits a vec4 load_input into four scalar load_input intrinsics and, for
+ * fragment shaders, rewrites the face and point-sprite channels.
+ */
+static void
+vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
+                    nir_intrinsic_instr *intr)
+{
+        const int location = intr->const_index[0];
+
+        nir_builder_insert_before_instr(b, &intr->instr);
+
+        /* The TLB color read input is left exactly as it is. */
+        if (c->stage == QSTAGE_FRAG &&
+            location == VC4_NIR_TLB_COLOR_READ_INPUT)
+                return;
+
+        /* Look up the variable behind this load to get its semantic. */
+        nir_variable *match = NULL;
+        foreach_list_typed(nir_variable, var, node, &c->s->inputs) {
+                if (var->data.driver_location != location)
+                        continue;
+                match = var;
+                break;
+        }
+        assert(match);
+        int semantic_name = match->data.location;
+        int semantic_index = match->data.index;
+
+        /* All TGSI-to-NIR inputs are vec4. */
+        assert(intr->num_components == 4);
+
+        /* Emit one scalar load per channel of the original vec4. */
+        nir_ssa_def *chan[4];
+        for (unsigned i = 0; i < intr->num_components; i++) {
+                nir_intrinsic_instr *load =
+                        nir_intrinsic_instr_create(c->s, nir_intrinsic_load_input);
+                load->num_components = 1;
+                load->const_index[0] = location * 4 + i;
+                nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+                nir_builder_instr_insert(b, &load->instr);
+
+                chan[i] = &load->dest.ssa;
+        }
+
+        /* Only fragment shader inputs get their channels rewritten. */
+        const bool is_frag = c->stage == QSTAGE_FRAG;
+
+        if (is_frag && semantic_name == TGSI_SEMANTIC_FACE) {
+                /* x = 1.0 - 2.0 * float(loaded x); yzw = (0, 0, 1). */
+                chan[0] = nir_fsub(b,
+                                   nir_imm_float(b, 1.0),
+                                   nir_fmul(b,
+                                            nir_i2f(b, chan[0]),
+                                            nir_imm_float(b, 2.0)));
+                chan[1] = nir_imm_float(b, 0.0);
+                chan[2] = nir_imm_float(b, 0.0);
+                chan[3] = nir_imm_float(b, 1.0);
+        } else if (is_frag && semantic_name == TGSI_SEMANTIC_GENERIC &&
+                   (c->fs_key->point_sprite_mask & (1 << semantic_index))) {
+                /* Sprite coordinates are zeroed when not drawing points. */
+                if (!c->fs_key->is_points) {
+                        chan[0] = nir_imm_float(b, 0.0);
+                        chan[1] = nir_imm_float(b, 0.0);
+                }
+                /* With point_coord_upper_left set, y becomes 1.0 - y. */
+                if (c->fs_key->point_coord_upper_left) {
+                        chan[1] = nir_fsub(b,
+                                           nir_imm_float(b, 1.0),
+                                           chan[1]);
+                }
+                chan[2] = nir_imm_float(b, 0.0);
+                chan[3] = nir_imm_float(b, 1.0);
+        }
+
+        replace_intrinsic_with_vec4(b, intr, chan);
+}
+
+/* Splits a vec4 store_output into four scalar stores.  Coordinate shaders keep
+ * only position and point size; fragment color stores are just renumbered.
+ */
+static void
+vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
+                     nir_intrinsic_instr *intr)
+{
+        nir_variable *match = NULL;
+        foreach_list_typed(nir_variable, var, node, &c->s->outputs) {
+                if (var->data.driver_location != intr->const_index[0])
+                        continue;
+                match = var;
+                break;
+        }
+        assert(match);
+        unsigned semantic_name = match->data.location;
+
+        bool coord_keeps = (semantic_name == TGSI_SEMANTIC_POSITION ||
+                            semantic_name == TGSI_SEMANTIC_PSIZE);
+        if (c->stage == QSTAGE_COORD && !coord_keeps) {
+                nir_instr_remove(&intr->instr);
+                return;
+        }
+
+        /* Color output is lowered by vc4_nir_lower_blend(). */
+        if (c->stage == QSTAGE_FRAG && semantic_name == TGSI_SEMANTIC_COLOR) {
+                intr->const_index[0] *= 4;
+                return;
+        }
+
+        /* All TGSI-to-NIR outputs are VEC4. */
+        assert(intr->num_components == 4);
+
+        nir_builder_insert_before_instr(b, &intr->instr);
+        for (unsigned chan = 0; chan < intr->num_components; chan++) {
+                nir_intrinsic_instr *store =
+                        nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output);
+                store->num_components = 1;
+                store->const_index[0] = intr->const_index[0] * 4 + chan;
+                assert(intr->src[0].is_ssa);
+                nir_ssa_def *scalar =
+                        nir_swizzle(b, intr->src[0].ssa, &chan, 1, false);
+                store->src[0] = nir_src_for_ssa(scalar);
+                nir_builder_instr_insert(b, &store->instr);
+        }
+        nir_instr_remove(&intr->instr);
+}
+
+/* Splits a vec4 uniform load into four scalar loads: direct loads get a dword
+ * index, indirect loads get byte offsets.  Scalar loads are left untouched.
+ */
+static void
+vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
+                      nir_intrinsic_instr *intr)
+{
+        /* All TGSI-to-NIR uniform loads are vec4, but we may create dword
+         * loads in our lowering passes.
+         */
+        if (intr->num_components == 1)
+                return;
+        assert(intr->num_components == 4);
+
+        nir_builder_insert_before_instr(b, &intr->instr);
+
+        /* Shift the TGSI register index to a byte offset once for all loads. */
+        const bool indirect =
+                intr->intrinsic == nir_intrinsic_load_uniform_indirect;
+        nir_ssa_def *byte_offset = NULL;
+        if (indirect) {
+                assert(intr->src[0].is_ssa);
+                byte_offset = nir_ishl(b, intr->src[0].ssa, nir_imm_int(b, 4));
+        }
+
+        /* Generate scalar loads equivalent to the original VEC4. */
+        nir_ssa_def *dests[4];
+        for (unsigned i = 0; i < intr->num_components; i++) {
+                nir_intrinsic_instr *load =
+                        nir_intrinsic_instr_create(c->s, intr->intrinsic);
+                load->num_components = 1;
+                nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+
+                if (indirect) {
+                        load->src[0] = nir_src_for_ssa(byte_offset);
+                        /* Convert the offset to be a byte index, too. */
+                        load->const_index[0] =
+                                intr->const_index[0] * 16 + i * 4;
+                } else {
+                        /* Non-indirect loads want a dword index. */
+                        load->const_index[0] = intr->const_index[0] * 4 + i;
+                }
+                dests[i] = &load->dest.ssa;
+                nir_builder_instr_insert(b, &load->instr);
+        }
+
+        replace_intrinsic_with_vec4(b, intr, dests);
+}
+
+/**
+ * Dispatches one instruction to the matching io lowering helper.  Anything
+ * other than an input load, output store, or (direct or indirect) uniform
+ * load is left untouched.
+ */
+static void
+vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
+                       struct nir_instr *instr)
+{
+        /* Only intrinsic instructions are candidates for lowering. */
+        if (instr->type != nir_instr_type_intrinsic)
+                return;
+
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+        const nir_intrinsic_op op = intr->intrinsic;
+
+        if (op == nir_intrinsic_load_input) {
+                vc4_nir_lower_input(c, b, intr);
+        } else if (op == nir_intrinsic_store_output) {
+                vc4_nir_lower_output(c, b, intr);
+        } else if (op == nir_intrinsic_load_uniform ||
+                   op == nir_intrinsic_load_uniform_indirect) {
+                /* Both the direct and indirect forms share one helper. */
+                vc4_nir_lower_uniform(c, b, intr);
+        }
+}
+
+/* nir_foreach_block() callback: lowers every instruction in the block.  The
+ * _safe iterator is used because lowering may remove the visited instruction.
+ */
+static bool
+vc4_nir_lower_io_block(nir_block *block, void *arg)
+{
+        struct vc4_compile *compile = arg;
+        nir_builder builder;
+
+        nir_builder_init(&builder, nir_cf_node_get_function(&block->cf_node));
+        nir_foreach_instr_safe(block, instr) {
+                vc4_nir_lower_io_instr(compile, &builder, instr);
+        }
+        return true;
+}
+
+/* Lowers io in every block of impl; always returns true. */
+static bool
+vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl)
+{
+        const nir_metadata kept = nir_metadata_block_index |
+                                  nir_metadata_dominance;
+        nir_foreach_block(impl, vc4_nir_lower_io_block, c);
+        nir_metadata_preserve(impl, kept);
+        return true;
+}
+
+/* Runs the io lowering on every overload in c->s that has an impl. */
+void
+vc4_nir_lower_io(struct vc4_compile *c)
+{
+        nir_foreach_overload(c->s, overload)
+                if (overload->impl != NULL)
+                        vc4_nir_lower_io_impl(c, overload->impl);
+}
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index d6d2fbf257f..a755de9aa41 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -67,10 +67,7 @@ qir_opt_copy_propagation(struct vc4_compile *c)
 
                 if (inst->op == QOP_MOV &&
                     inst->dst.file == QFILE_TEMP &&
-                    inst->src[0].file != QFILE_VPM &&
-                    !(inst->src[0].file == QFILE_TEMP &&
-                      (c->defs[inst->src[0].index]->op == QOP_TEX_RESULT ||
-                       c->defs[inst->src[0].index]->op == QOP_TLB_COLOR_READ))) {
+                    inst->src[0].file != QFILE_VPM) {
                         movs[inst->dst.index] = inst->src[0];
                 }
         }
diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 92c8260eb59..0e5480ea781 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -46,8 +46,7 @@ struct inst_key {
         struct qreg src[4];
         /**
          * If the instruction depends on the flags, how many SFs have been
-         * seen before this instruction, or if it depends on r4, how many r4
-         * writes have been seen.
+         * seen before this instruction.
          */
         uint32_t implicit_arg_update_count;
 };
@@ -63,8 +62,7 @@ inst_key_equals(const void *a, const void *b)
 
 static struct qinst *
 vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
-             struct qinst *inst, uint32_t sf_count,
-             uint32_t r4_count)
+             struct qinst *inst, uint32_t sf_count)
 {
         if (inst->dst.file != QFILE_TEMP ||
             inst->op == QOP_MOV ||
@@ -79,8 +77,6 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
                qir_get_op_nsrc(inst->op) * sizeof(key.src[0]));
         if (qir_depends_on_flags(inst))
                 key.implicit_arg_update_count = sf_count;
-        if (qir_reads_r4(inst))
-                key.implicit_arg_update_count = r4_count;
 
         uint32_t hash = _mesa_hash_data(&key, sizeof(key));
         struct hash_entry *entry =
@@ -121,7 +117,7 @@ bool
 qir_opt_cse(struct vc4_compile *c)
 {
         bool progress = false;
-        uint32_t sf_count = 0, r4_count = 0;
+        uint32_t sf_count = 0;
 
         struct hash_table *ht = _mesa_hash_table_create(NULL, NULL,
                                                         inst_key_equals);
@@ -130,15 +126,15 @@ qir_opt_cse(struct vc4_compile *c)
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (qir_has_side_effects(c, inst) ||
-                    qir_has_side_effect_reads(c, inst)) {
+                    qir_has_side_effect_reads(c, inst) ||
+                    inst->op == QOP_TLB_COLOR_READ) {
                         continue;
                 }
 
                 if (inst->sf) {
                         sf_count++;
                 } else {
-                        struct qinst *cse = vc4_find_cse(c, ht, inst,
-                                                         sf_count, r4_count);
+                        struct qinst *cse = vc4_find_cse(c, ht, inst, sf_count);
                         if (cse) {
                                 inst->src[0] = cse->dst;
                                 for (int i = 1; i < qir_get_op_nsrc(inst->op);
@@ -154,9 +150,6 @@ qir_opt_cse(struct vc4_compile *c)
                                 }
                         }
                 }
-
-                if (qir_writes_r4(inst))
-                        r4_count++;
         }
 
         ralloc_free(ht);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ba47c51d9bd..13c472152d8 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -23,21 +23,19 @@
  */
 
 #include <inttypes.h>
-#include "pipe/p_state.h"
 #include "util/u_format.h"
 #include "util/u_hash.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/u_pack_color.h"
-#include "util/format_srgb.h"
 #include "util/ralloc.h"
 #include "util/hash_table.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_lowering.h"
 #include "tgsi/tgsi_parse.h"
+#include "glsl/nir/nir.h"
+#include "glsl/nir/nir_builder.h"
 #include "nir/tgsi_to_nir.h"
-
 #include "vc4_context.h"
 #include "vc4_qpu.h"
 #include "vc4_qir.h"
@@ -45,51 +43,8 @@
 #include "simpenrose/simpenrose.h"
 #endif
 
-struct vc4_key {
-        struct vc4_uncompiled_shader *shader_state;
-        struct {
-                enum pipe_format format;
-                unsigned compare_mode:1;
-                unsigned compare_func:3;
-                unsigned wrap_s:3;
-                unsigned wrap_t:3;
-                uint8_t swizzle[4];
-        } tex[VC4_MAX_TEXTURE_SAMPLERS];
-        uint8_t ucp_enables;
-};
-
-struct vc4_fs_key {
-        struct vc4_key base;
-        enum pipe_format color_format;
-        bool depth_enabled;
-        bool stencil_enabled;
-        bool stencil_twoside;
-        bool stencil_full_writemasks;
-        bool is_points;
-        bool is_lines;
-        bool alpha_test;
-        bool point_coord_upper_left;
-        bool light_twoside;
-        uint8_t alpha_test_func;
-        uint8_t logicop_func;
-        uint32_t point_sprite_mask;
-
-        struct pipe_rt_blend_state blend;
-};
-
-struct vc4_vs_key {
-        struct vc4_key base;
-
-        /**
-         * This is a proxy for the array of FS input semantics, which is
-         * larger than we would want to put in the key.
-         */
-        uint64_t compiled_fs_id;
-
-        enum pipe_format attr_formats[8];
-        bool is_coord;
-        bool per_vertex_point_size;
-};
+static struct qreg
+ntq_get_src(struct vc4_compile *c, nir_src src, int i);
 
 static void
 resize_qreg_array(struct vc4_compile *c,
@@ -113,10 +68,10 @@ resize_qreg_array(struct vc4_compile *c,
 }
 
 static struct qreg
-indirect_uniform_load(struct vc4_compile *c,
-                      struct qreg indirect_offset,
-                      unsigned offset)
+indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
 {
+        struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
+        uint32_t offset = intr->const_index[0];
         struct vc4_compiler_ubo_range *range = NULL;
         unsigned i;
         for (i = 0; i < c->num_uniform_ranges; i++) {
@@ -138,10 +93,6 @@ indirect_uniform_load(struct vc4_compile *c,
         };
 
         offset -= range->src_offset;
-        /* Translate the user's TGSI register index from the TGSI register
-         * base to a byte offset.
-         */
-        indirect_offset = qir_SHL(c, indirect_offset, qir_uniform_ui(c, 4));
 
         /* Adjust for where we stored the TGSI register base. */
         indirect_offset = qir_ADD(c, indirect_offset,
@@ -155,24 +106,70 @@ indirect_uniform_load(struct vc4_compile *c,
                                                      range->size - 4)));
 
         qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
-        struct qreg r4 = qir_TEX_RESULT(c);
         c->num_texture_samples++;
-        return qir_MOV(c, r4);
+        return qir_TEX_RESULT(c);
+}
+
+/* Emits a scalar load_uniform at VC4_NIR_STATE_UNIFORM_OFFSET + contents. */
+nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
+                                       enum quniform_contents contents)
+{
+        nir_intrinsic_op op = nir_intrinsic_load_uniform;
+        nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
+        load->num_components = 1;
+        load->const_index[0] = VC4_NIR_STATE_UNIFORM_OFFSET + contents;
+        nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+        nir_builder_instr_insert(b, &load->instr);
+        return &load->dest.ssa;
+}
+
+/* Maps a util_format swizzle selector to a srcs[] channel or to 0.0/1.0. */
+nir_ssa_def *
+vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+{
+        switch (swiz) {
+        case UTIL_FORMAT_SWIZZLE_X:
+        case UTIL_FORMAT_SWIZZLE_Y:
+        case UTIL_FORMAT_SWIZZLE_Z:
+        case UTIL_FORMAT_SWIZZLE_W:
+                return srcs[swiz];
+        case UTIL_FORMAT_SWIZZLE_1:
+                return nir_imm_float(b, 1.0);
+        case UTIL_FORMAT_SWIZZLE_0:
+                return nir_imm_float(b, 0.0);
+        default: /* UTIL_FORMAT_SWIZZLE_NONE lands here as well */
+                fprintf(stderr, "warning: unknown swizzle\n");
+                return nir_imm_float(b, 0.0);
+        }
 }
 
 static struct qreg *
-ntq_get_dest(struct vc4_compile *c, nir_dest dest)
+ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
 {
-        assert(!dest.is_ssa);
-        nir_register *reg = dest.reg.reg;
-        struct hash_entry *entry = _mesa_hash_table_search(c->def_ht, reg);
-        assert(reg->num_array_elems == 0);
-        assert(dest.reg.base_offset == 0);
-
-        struct qreg *qregs = entry->data;
+        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
+                                          def->num_components);
+        _mesa_hash_table_insert(c->def_ht, def, qregs);
         return qregs;
 }
 
+/* Returns the qreg array for dest: a new one filled with c->undef for an SSA
+ * def, or the one stored in c->def_ht for a (non-array) register.
+ */
+static struct qreg *
+ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
+{
+        if (!dest->is_ssa) {
+                nir_register *reg = dest->reg.reg;
+                assert(dest->reg.base_offset == 0);
+                assert(reg->num_array_elems == 0);
+                return _mesa_hash_table_search(c->def_ht, reg)->data;
+        }
+        struct qreg *regs = ntq_init_ssa_def(c, &dest->ssa);
+        for (int chan = 0; chan < dest->ssa.num_components; chan++)
+                regs[chan] = c->undef;
+        return regs;
+}
+
 static struct qreg
 ntq_get_src(struct vc4_compile *c, nir_src src, int i)
 {
@@ -281,22 +278,6 @@ qir_srgb_decode(struct vc4_compile *c, struct qreg srgb)
         return qir_SEL_X_Y_NS(c, low, high);
 }
 
-static struct qreg
-qir_srgb_encode(struct vc4_compile *c, struct qreg linear)
-{
-        struct qreg low = qir_FMUL(c, linear, qir_uniform_f(c, 12.92));
-        struct qreg high = qir_FSUB(c,
-                                    qir_FMUL(c,
-                                             qir_uniform_f(c, 1.055),
-                                             qir_POW(c,
-                                                     linear,
-                                                     qir_uniform_f(c, 0.41666))),
-                                    qir_uniform_f(c, 0.055));
-
-        qir_SF(c, qir_FSUB(c, linear, qir_uniform_f(c, 0.0031308)));
-        return qir_SEL_X_Y_NS(c, low, high);
-}
-
 static struct qreg
 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
 {
@@ -410,13 +391,13 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
         qir_TEX_S(c, s, texture_u[next_texture_u++]);
 
         c->num_texture_samples++;
-        struct qreg r4 = qir_TEX_RESULT(c);
+        struct qreg tex = qir_TEX_RESULT(c);
 
         enum pipe_format format = c->key->tex[unit].format;
 
         struct qreg unpacked[4];
         if (util_format_is_depth_or_stencil(format)) {
-                struct qreg depthf = qir_ITOF(c, qir_SHR(c, r4,
+                struct qreg depthf = qir_ITOF(c, qir_SHR(c, tex,
                                                          qir_uniform_ui(c, 8)));
                 struct qreg normalized = qir_FMUL(c, depthf,
                                                   qir_uniform_f(c, 1.0f/0xffffff));
@@ -468,7 +449,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                         unpacked[i] = depth_output;
         } else {
                 for (int i = 0; i < 4; i++)
-                        unpacked[i] = qir_R4_UNPACK(c, r4, i);
+                        unpacked[i] = qir_UNPACK_8_F(c, tex, i);
         }
 
         const uint8_t *format_swiz = vc4_get_format_swizzle(format);
@@ -484,7 +465,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                                                             texture_output[i]);
         }
 
-        struct qreg *dest = ntq_get_dest(c, instr->dest);
+        struct qreg *dest = ntq_get_dest(c, &instr->dest);
         for (int i = 0; i < 4; i++) {
                 dest[i] = get_swizzled_channel(c, texture_output,
                                                c->key->tex[unit].swizzle[i]);
@@ -558,7 +539,7 @@ ntq_fsin(struct vc4_compile *c, struct qreg src)
         struct qreg scaled_x =
                 qir_FMUL(c,
                          src,
-                         qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
+                         qir_uniform_f(c, 1.0 / (M_PI * 2.0)));
 
         struct qreg x = qir_FADD(c,
                                  ntq_ffract(c, scaled_x),
@@ -756,26 +737,6 @@ emit_fragcoord_input(struct vc4_compile *c, int attr)
         c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
 }
 
-static void
-emit_point_coord_input(struct vc4_compile *c, int attr)
-{
-        if (c->point_x.file == QFILE_NULL) {
-                c->point_x = qir_uniform_f(c, 0.0);
-                c->point_y = qir_uniform_f(c, 0.0);
-        }
-
-        c->inputs[attr * 4 + 0] = c->point_x;
-        if (c->fs_key->point_coord_upper_left) {
-                c->inputs[attr * 4 + 1] = qir_FSUB(c,
-                                                   qir_uniform_f(c, 1.0),
-                                                   c->point_y);
-        } else {
-                c->inputs[attr * 4 + 1] = c->point_y;
-        }
-        c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0);
-        c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0);
-}
-
 static struct qreg
 emit_fragment_varying(struct vc4_compile *c, uint8_t semantic,
                       uint8_t index, uint8_t swizzle)
@@ -816,19 +777,6 @@ emit_fragment_input(struct vc4_compile *c, int attr,
         }
 }
 
-static void
-emit_face_input(struct vc4_compile *c, int attr)
-{
-        c->inputs[attr * 4 + 0] = qir_FSUB(c,
-                                           qir_uniform_f(c, 1.0),
-                                           qir_FMUL(c,
-                                                    qir_ITOF(c, qir_FRAG_REV_FLAG(c)),
-                                                    qir_uniform_f(c, 2.0)));
-        c->inputs[attr * 4 + 1] = qir_uniform_f(c, 0.0);
-        c->inputs[attr * 4 + 2] = qir_uniform_f(c, 0.0);
-        c->inputs[attr * 4 + 3] = qir_uniform_f(c, 1.0);
-}
-
 static void
 add_output(struct vc4_compile *c,
            uint32_t decl_offset,
@@ -884,12 +832,38 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                         srcs[i] = ntq_get_src(c, instr->src[i].src,
                                               instr->src[i].swizzle[0]);
-                struct qreg *dest = ntq_get_dest(c, instr->dest.dest);
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                         dest[i] = srcs[i];
                 return;
         }
 
+        if (instr->op == nir_op_pack_unorm_4x8) {
+                struct qreg result;
+                for (int i = 0; i < 4; i++) {
+                        struct qreg src = ntq_get_src(c, instr->src[0].src,
+                                                      instr->src[0].swizzle[i]);
+                        if (i == 0)
+                                result = qir_PACK_8888_F(c, src);
+                        else
+                                result = qir_PACK_8_F(c, result, src, i);
+                }
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+                *dest = result;
+                return;
+        }
+
+        if (instr->op == nir_op_unpack_unorm_4x8) {
+                struct qreg src = ntq_get_src(c, instr->src[0].src,
+                                              instr->src[0].swizzle[0]);
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+                for (int i = 0; i < 4; i++) {
+                        if (instr->dest.write_mask & (1 << i))
+                                dest[i] = qir_UNPACK_8_F(c, src, i);
+                }
+                return;
+        }
+
         /* General case: We can just grab the one used channel per src. */
         struct qreg src[nir_op_infos[instr->op].num_inputs];
         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
@@ -898,7 +872,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 
         /* Pick the channel to store the output in. */
         assert(!instr->dest.saturate);
-        struct qreg *dest = ntq_get_dest(c, instr->dest.dest);
+        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
         assert(util_is_power_of_two(instr->dest.write_mask));
         dest += ffs(instr->dest.write_mask) - 1;
 
@@ -1092,167 +1066,6 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         }
 }
 
-static struct qreg
-vc4_blend_channel(struct vc4_compile *c,
-                  struct qreg *dst,
-                  struct qreg *src,
-                  struct qreg val,
-                  unsigned factor,
-                  int channel)
-{
-        switch(factor) {
-        case PIPE_BLENDFACTOR_ONE:
-                return val;
-        case PIPE_BLENDFACTOR_SRC_COLOR:
-                return qir_FMUL(c, val, src[channel]);
-        case PIPE_BLENDFACTOR_SRC_ALPHA:
-                return qir_FMUL(c, val, src[3]);
-        case PIPE_BLENDFACTOR_DST_ALPHA:
-                return qir_FMUL(c, val, dst[3]);
-        case PIPE_BLENDFACTOR_DST_COLOR:
-                return qir_FMUL(c, val, dst[channel]);
-        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-                if (channel != 3) {
-                        return qir_FMUL(c,
-                                        val,
-                                        qir_FMIN(c,
-                                                 src[3],
-                                                 qir_FSUB(c,
-                                                          qir_uniform_f(c, 1.0),
-                                                          dst[3])));
-                } else {
-                        return val;
-                }
-        case PIPE_BLENDFACTOR_CONST_COLOR:
-                return qir_FMUL(c, val,
-                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR,
-                                            channel));
-        case PIPE_BLENDFACTOR_CONST_ALPHA:
-                return qir_FMUL(c, val,
-                                qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR, 3));
-        case PIPE_BLENDFACTOR_ZERO:
-                return qir_uniform_f(c, 0.0);
-        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 src[channel]));
-        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 src[3]));
-        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 dst[3]));
-        case PIPE_BLENDFACTOR_INV_DST_COLOR:
-                return qir_FMUL(c, val, qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                                 dst[channel]));
-        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-                return qir_FMUL(c, val,
-                                qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         qir_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     channel)));
-        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-                return qir_FMUL(c, val,
-                                qir_FSUB(c, qir_uniform_f(c, 1.0),
-                                         qir_uniform(c,
-                                                     QUNIFORM_BLEND_CONST_COLOR,
-                                                     3)));
-
-        default:
-        case PIPE_BLENDFACTOR_SRC1_COLOR:
-        case PIPE_BLENDFACTOR_SRC1_ALPHA:
-        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-                /* Unsupported. */
-                fprintf(stderr, "Unknown blend factor %d\n", factor);
-                return val;
-        }
-}
-
-static struct qreg
-vc4_blend_func(struct vc4_compile *c,
-               struct qreg src, struct qreg dst,
-               unsigned func)
-{
-        switch (func) {
-        case PIPE_BLEND_ADD:
-                return qir_FADD(c, src, dst);
-        case PIPE_BLEND_SUBTRACT:
-                return qir_FSUB(c, src, dst);
-        case PIPE_BLEND_REVERSE_SUBTRACT:
-                return qir_FSUB(c, dst, src);
-        case PIPE_BLEND_MIN:
-                return qir_FMIN(c, src, dst);
-        case PIPE_BLEND_MAX:
-                return qir_FMAX(c, src, dst);
-
-        default:
-                /* Unsupported. */
-                fprintf(stderr, "Unknown blend func %d\n", func);
-                return src;
-
-        }
-}
-
-/**
- * Implements fixed function blending in shader code.
- *
- * VC4 doesn't have any hardware support for blending.  Instead, you read the
- * current contents of the destination from the tile buffer after having
- * waited for the scoreboard (which is handled by vc4_qpu_emit.c), then do
- * math using your output color and that destination value, and update the
- * output color appropriately.
- */
-static void
-vc4_blend(struct vc4_compile *c, struct qreg *result,
-          struct qreg *dst_color, struct qreg *src_color)
-{
-        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
-
-        if (!blend->blend_enable) {
-                for (int i = 0; i < 4; i++)
-                        result[i] = src_color[i];
-                return;
-        }
-
-        struct qreg clamped_src[4];
-        struct qreg clamped_dst[4];
-        for (int i = 0; i < 4; i++) {
-                clamped_src[i] = qir_SAT(c, src_color[i]);
-                clamped_dst[i] = qir_SAT(c, dst_color[i]);
-        }
-        src_color = clamped_src;
-        dst_color = clamped_dst;
-
-        struct qreg src_blend[4], dst_blend[4];
-        for (int i = 0; i < 3; i++) {
-                src_blend[i] = vc4_blend_channel(c,
-                                                 dst_color, src_color,
-                                                 src_color[i],
-                                                 blend->rgb_src_factor, i);
-                dst_blend[i] = vc4_blend_channel(c,
-                                                 dst_color, src_color,
-                                                 dst_color[i],
-                                                 blend->rgb_dst_factor, i);
-        }
-        src_blend[3] = vc4_blend_channel(c,
-                                         dst_color, src_color,
-                                         src_color[3],
-                                         blend->alpha_src_factor, 3);
-        dst_blend[3] = vc4_blend_channel(c,
-                                         dst_color, src_color,
-                                         dst_color[3],
-                                         blend->alpha_dst_factor, 3);
-
-        for (int i = 0; i < 3; i++) {
-                result[i] = vc4_blend_func(c,
-                                           src_blend[i], dst_blend[i],
-                                           blend->rgb_func);
-        }
-        result[3] = vc4_blend_func(c,
-                                   src_blend[3], dst_blend[3],
-                                   blend->alpha_func);
-}
-
 static void
 clip_distance_discard(struct vc4_compile *c)
 {
@@ -1275,168 +1088,16 @@ clip_distance_discard(struct vc4_compile *c)
         }
 }
 
-static void
-alpha_test_discard(struct vc4_compile *c)
-{
-        struct qreg src_alpha;
-        struct qreg alpha_ref = qir_uniform(c, QUNIFORM_ALPHA_REF, 0);
-
-        if (!c->fs_key->alpha_test)
-                return;
-
-        if (c->output_color_index != -1)
-                src_alpha = c->outputs[c->output_color_index + 3];
-        else
-                src_alpha = qir_uniform_f(c, 1.0);
-
-        if (c->discard.file == QFILE_NULL)
-                c->discard = qir_uniform_ui(c, 0);
-
-        switch (c->fs_key->alpha_test_func) {
-        case PIPE_FUNC_NEVER:
-                c->discard = qir_uniform_ui(c, ~0);
-                break;
-        case PIPE_FUNC_ALWAYS:
-                break;
-        case PIPE_FUNC_EQUAL:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_ZS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_NOTEQUAL:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_ZC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_GREATER:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_NC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_GEQUAL:
-                qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
-                c->discard = qir_SEL_X_Y_NS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_LESS:
-                qir_SF(c, qir_FSUB(c, src_alpha, alpha_ref));
-                c->discard = qir_SEL_X_Y_NS(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        case PIPE_FUNC_LEQUAL:
-                qir_SF(c, qir_FSUB(c, alpha_ref, src_alpha));
-                c->discard = qir_SEL_X_Y_NC(c, c->discard,
-                                            qir_uniform_ui(c, ~0));
-                break;
-        }
-}
-
-static struct qreg
-vc4_logicop(struct vc4_compile *c, struct qreg src, struct qreg dst)
-{
-        switch (c->fs_key->logicop_func) {
-        case PIPE_LOGICOP_CLEAR:
-                return qir_uniform_f(c, 0.0);
-        case PIPE_LOGICOP_NOR:
-                return qir_NOT(c, qir_OR(c, src, dst));
-        case PIPE_LOGICOP_AND_INVERTED:
-                return qir_AND(c, qir_NOT(c, src), dst);
-        case PIPE_LOGICOP_COPY_INVERTED:
-                return qir_NOT(c, src);
-        case PIPE_LOGICOP_AND_REVERSE:
-                return qir_AND(c, src, qir_NOT(c, dst));
-        case PIPE_LOGICOP_INVERT:
-                return qir_NOT(c, dst);
-        case PIPE_LOGICOP_XOR:
-                return qir_XOR(c, src, dst);
-        case PIPE_LOGICOP_NAND:
-                return qir_NOT(c, qir_AND(c, src, dst));
-        case PIPE_LOGICOP_AND:
-                return qir_AND(c, src, dst);
-        case PIPE_LOGICOP_EQUIV:
-                return qir_NOT(c, qir_XOR(c, src, dst));
-        case PIPE_LOGICOP_NOOP:
-                return dst;
-        case PIPE_LOGICOP_OR_INVERTED:
-                return qir_OR(c, qir_NOT(c, src), dst);
-        case PIPE_LOGICOP_OR_REVERSE:
-                return qir_OR(c, src, qir_NOT(c, dst));
-        case PIPE_LOGICOP_OR:
-                return qir_OR(c, src, dst);
-        case PIPE_LOGICOP_SET:
-                return qir_uniform_ui(c, ~0);
-        case PIPE_LOGICOP_COPY:
-        default:
-                return src;
-        }
-}
-
 static void
 emit_frag_end(struct vc4_compile *c)
 {
         clip_distance_discard(c);
-        alpha_test_discard(c);
 
-        enum pipe_format color_format = c->fs_key->color_format;
-        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
-        struct qreg tlb_read_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg linear_dst_color[4] = { c->undef, c->undef, c->undef, c->undef };
-        struct qreg packed_dst_color = c->undef;
-
-        if (c->fs_key->blend.blend_enable ||
-            c->fs_key->blend.colormask != 0xf ||
-            c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                struct qreg r4 = qir_TLB_COLOR_READ(c);
-                for (int i = 0; i < 4; i++)
-                        tlb_read_color[i] = qir_R4_UNPACK(c, r4, i);
-                for (int i = 0; i < 4; i++) {
-                        dst_color[i] = get_swizzled_channel(c,
-                                                            tlb_read_color,
-                                                            format_swiz[i]);
-                        if (util_format_is_srgb(color_format) && i != 3) {
-                                linear_dst_color[i] =
-                                        qir_srgb_decode(c, dst_color[i]);
-                        } else {
-                                linear_dst_color[i] = dst_color[i];
-                        }
-                }
-
-                /* Save the packed value for logic ops.  Can't reuse r4
-                 * because other things might smash it (like sRGB)
-                 */
-                packed_dst_color = qir_MOV(c, r4);
-        }
-
-        struct qreg blend_color[4];
-        struct qreg undef_array[4] = {
-                c->undef, c->undef, c->undef, c->undef
-        };
-        vc4_blend(c, blend_color, linear_dst_color,
-                  (c->output_color_index != -1 ?
-                   c->outputs + c->output_color_index :
-                   undef_array));
-
-        if (util_format_is_srgb(color_format)) {
-                for (int i = 0; i < 3; i++)
-                        blend_color[i] = qir_srgb_encode(c, blend_color[i]);
-        }
-
-        /* Debug: Sometimes you're getting a black output and just want to see
-         * if the FS is getting executed at all.  Spam magenta into the color
-         * output.
-         */
-        if (0) {
-                blend_color[0] = qir_uniform_f(c, 1.0);
-                blend_color[1] = qir_uniform_f(c, 0.0);
-                blend_color[2] = qir_uniform_f(c, 1.0);
-                blend_color[3] = qir_uniform_f(c, 0.5);
-        }
-
-        struct qreg swizzled_outputs[4];
-        for (int i = 0; i < 4; i++) {
-                swizzled_outputs[i] = get_swizzled_channel(c, blend_color,
-                                                           format_swiz[i]);
+        struct qreg color;
+        if (c->output_color_index != -1) {
+                color = c->outputs[c->output_color_index];
+        } else {
+                color = qir_uniform_ui(c, 0);
         }
 
         if (c->discard.file != QFILE_NULL)
@@ -1463,47 +1124,7 @@ emit_frag_end(struct vc4_compile *c)
                 qir_TLB_Z_WRITE(c, z);
         }
 
-        struct qreg packed_color = c->undef;
-        for (int i = 0; i < 4; i++) {
-                if (swizzled_outputs[i].file == QFILE_NULL)
-                        continue;
-                if (packed_color.file == QFILE_NULL) {
-                        packed_color = qir_PACK_8888_F(c, swizzled_outputs[i]);
-                } else {
-                        packed_color = qir_PACK_8_F(c,
-                                                    packed_color,
-                                                    swizzled_outputs[i],
-                                                    i);
-                }
-        }
-
-        if (packed_color.file == QFILE_NULL)
-                packed_color = qir_uniform_ui(c, 0);
-
-        if (c->fs_key->logicop_func != PIPE_LOGICOP_COPY) {
-                packed_color = vc4_logicop(c, packed_color, packed_dst_color);
-        }
-
-        /* If the bit isn't set in the color mask, then just return the
-         * original dst color, instead.
-         */
-        uint32_t colormask = 0xffffffff;
-        for (int i = 0; i < 4; i++) {
-                if (format_swiz[i] < 4 &&
-                    !(c->fs_key->blend.colormask & (1 << format_swiz[i]))) {
-                        colormask &= ~(0xff << (i * 8));
-                }
-        }
-        if (colormask != 0xffffffff) {
-                packed_color = qir_OR(c,
-                                      qir_AND(c, packed_color,
-                                              qir_uniform_ui(c, colormask)),
-                                      qir_AND(c, packed_dst_color,
-                                              qir_uniform_ui(c, ~colormask)));
-        }
-
-        qir_emit(c, qir_inst(QOP_TLB_COLOR_WRITE, c->undef,
-                             packed_color, c->undef));
+        qir_TLB_COLOR_WRITE(c, color);
 }
 
 static void
@@ -1695,6 +1316,7 @@ vc4_optimize_nir(struct nir_shader *s)
                 progress = nir_opt_peephole_select(s) || progress;
                 progress = nir_opt_algebraic(s) || progress;
                 progress = nir_opt_constant_folding(s) || progress;
+                progress = nir_opt_undef(s) || progress;
         } while (progress);
 }
 
@@ -1736,6 +1358,7 @@ ntq_setup_inputs(struct vc4_compile *c)
                 unsigned loc = var->data.driver_location;
 
                 assert(array_len == 1);
+                (void)array_len;
                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
                                   (loc + 1) * 4);
 
@@ -1743,11 +1366,12 @@ ntq_setup_inputs(struct vc4_compile *c)
                         if (semantic_name == TGSI_SEMANTIC_POSITION) {
                                 emit_fragcoord_input(c, loc);
                         } else if (semantic_name == TGSI_SEMANTIC_FACE) {
-                                emit_face_input(c, loc);
+                                c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c);
                         } else if (semantic_name == TGSI_SEMANTIC_GENERIC &&
                                    (c->fs_key->point_sprite_mask &
                                     (1 << semantic_index))) {
-                                emit_point_coord_input(c, loc);
+                                c->inputs[loc * 4 + 0] = c->point_x;
+                                c->inputs[loc * 4 + 1] = c->point_y;
                         } else {
                                 emit_fragment_input(c, loc,
                                                     semantic_name,
@@ -1770,6 +1394,13 @@ ntq_setup_outputs(struct vc4_compile *c)
                 unsigned loc = var->data.driver_location * 4;
 
                 assert(array_len == 1);
+                (void)array_len;
+
+                /* NIR hack to pass through
+                 * TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS */
+                if (semantic_name == TGSI_SEMANTIC_COLOR &&
+                    semantic_index == -1)
+                        semantic_index = 0;
 
                 for (int i = 0; i < 4; i++) {
                         add_output(c,
@@ -1834,14 +1465,25 @@ ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
 static void
 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
 {
-        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
-                                          instr->def.num_components);
+        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
                 qregs[i] = qir_uniform_ui(c, instr->value.u[i]);
 
         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
 }
 
+static void
+ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
+{
+        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
+
+        /* QIR needs there to be *some* value, so pick 0 (same as for
+         * ntq_setup_registers()).
+         */
+        for (int i = 0; i < instr->def.num_components; i++)
+                qregs[i] = qir_uniform_ui(c, 0);
+}
+
 static void
 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 {
@@ -1849,41 +1491,41 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
         struct qreg *dest = NULL;
 
         if (info->has_dest) {
-                dest = ntq_get_dest(c, instr->dest);
+                dest = ntq_get_dest(c, &instr->dest);
         }
 
         switch (instr->intrinsic) {
         case nir_intrinsic_load_uniform:
-                for (int i = 0; i < instr->num_components; i++) {
-                        dest[i] = qir_uniform(c, QUNIFORM_UNIFORM,
-                                              instr->const_index[0] * 4 + i);
+                assert(instr->num_components == 1);
+                if (instr->const_index[0] < VC4_NIR_STATE_UNIFORM_OFFSET) {
+                        *dest = qir_uniform(c, QUNIFORM_UNIFORM,
+                                            instr->const_index[0]);
+                } else {
+                        *dest = qir_uniform(c, instr->const_index[0] -
+                                            VC4_NIR_STATE_UNIFORM_OFFSET,
+                                            0);
                 }
                 break;
 
         case nir_intrinsic_load_uniform_indirect:
-                for (int i = 0; i < instr->num_components; i++) {
-                        dest[i] = indirect_uniform_load(c,
-                                                        ntq_get_src(c, instr->src[0], 0),
-                                                        (instr->const_index[0] *
-                                                         4 + i) * sizeof(float));
-                }
+                *dest = indirect_uniform_load(c, instr);
 
                 break;
 
         case nir_intrinsic_load_input:
-                for (int i = 0; i < instr->num_components; i++)
-                        dest[i] = c->inputs[instr->const_index[0] * 4 + i];
-
+                assert(instr->num_components == 1);
+                if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
+                        *dest = qir_TLB_COLOR_READ(c);
+                } else {
+                        *dest = c->inputs[instr->const_index[0]];
+                }
                 break;
 
         case nir_intrinsic_store_output:
-                for (int i = 0; i < instr->num_components; i++) {
-                        c->outputs[instr->const_index[0] * 4 + i] =
-                                qir_MOV(c, ntq_get_src(c, instr->src[0], i));
-                }
-                c->num_outputs = MAX2(c->num_outputs,
-                                      instr->const_index[0] * 4 +
-                                      instr->num_components + 1);
+                assert(instr->num_components == 1);
+                c->outputs[instr->const_index[0]] =
+                        qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
+                c->num_outputs = MAX2(c->num_outputs, instr->const_index[0] + 1);
                 break;
 
         case nir_intrinsic_discard:
@@ -1927,6 +1569,10 @@ ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
                 break;
 
+        case nir_instr_type_ssa_undef:
+                ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
+                break;
+
         case nir_instr_type_tex:
                 ntq_emit_tex(c, nir_instr_as_tex(instr));
                 break;
@@ -2084,13 +1730,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         c->s = tgsi_to_nir(tokens, &nir_options);
         nir_opt_global_to_local(c->s);
         nir_convert_to_ssa(c->s);
+        if (stage == QSTAGE_FRAG)
+                vc4_nir_lower_blend(c);
+        vc4_nir_lower_io(c);
         nir_lower_idiv(c->s);
+        nir_lower_load_const_to_scalar(c->s);
 
         vc4_optimize_nir(c->s);
 
         nir_remove_dead_variables(c->s);
 
-        nir_convert_from_ssa(c->s);
+        nir_convert_from_ssa(c->s, true);
 
         if (vc4_debug & VC4_DEBUG_SHADERDB) {
                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",
@@ -2187,6 +1837,8 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
         memcpy(uinfo->contents, c->uniform_contents,
                count * sizeof(*uinfo->contents));
         uinfo->num_texture_samples = c->num_texture_samples;
+
+        vc4_set_shader_uniform_dirty_flags(shader);
 }
 
 static struct vc4_compiled_shader *
@@ -2259,9 +1911,8 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
         }
 
         copy_uniform_state_to_shader(shader, c);
-        shader->bo = vc4_bo_alloc_mem(vc4->screen, c->qpu_insts,
-                                      c->qpu_inst_count * sizeof(uint64_t),
-                                      "code");
+        shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
+                                         c->qpu_inst_count * sizeof(uint64_t));
 
         /* Copy the compiler UBO range state to the compiled shader, dropping
          * out arrays that were never referenced by an indirect load.
@@ -2288,10 +1939,12 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
                 }
         }
         if (shader->ubo_size) {
-                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
-                        qir_get_stage_name(c->stage),
-                        c->program_id, c->variant_id,
-                        shader->ubo_size / 4);
+                if (vc4_debug & VC4_DEBUG_SHADERDB) {
+                        fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
+                                qir_get_stage_name(c->stage),
+                                c->program_id, c->variant_id,
+                                shader->ubo_size / 4);
+                }
         }
 
         qir_compile_destroy(c);
@@ -2421,9 +2074,20 @@ vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
                 (prim_mode == PIPE_PRIM_POINTS &&
                  vc4->rasterizer->base.point_size_per_vertex);
 
-        vc4->prog.vs = vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
+        struct vc4_compiled_shader *vs =
+                vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
+        if (vs != vc4->prog.vs) {
+                vc4->prog.vs = vs;
+                vc4->dirty |= VC4_DIRTY_COMPILED_VS;
+        }
+
         key->is_coord = true;
-        vc4->prog.cs = vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
+        struct vc4_compiled_shader *cs =
+                vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
+        if (cs != vc4->prog.cs) {
+                vc4->prog.cs = cs;
+                vc4->dirty |= VC4_DIRTY_COMPILED_CS;
+        }
 }
 
 void
@@ -2490,305 +2154,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
         free(so);
 }
 
-static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
-{
-        switch (p_wrap) {
-        case PIPE_TEX_WRAP_REPEAT:
-                return 0;
-        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-                return 1;
-        case PIPE_TEX_WRAP_MIRROR_REPEAT:
-                return 2;
-        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-                return 3;
-        case PIPE_TEX_WRAP_CLAMP:
-                return (using_nearest ? 1 : 3);
-        default:
-                fprintf(stderr, "Unknown wrap mode %d\n", p_wrap);
-                assert(!"not reached");
-                return 0;
-        }
-}
-
-static void
-write_texture_p0(struct vc4_context *vc4,
-                 struct vc4_texture_stateobj *texstate,
-                 uint32_t unit)
-{
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-
-        cl_reloc(vc4, &vc4->uniforms, rsc->bo,
-                 VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
-                 VC4_SET_FIELD(texture->u.tex.last_level -
-                               texture->u.tex.first_level, VC4_TEX_P0_MIPLVLS) |
-                 VC4_SET_FIELD(texture->target == PIPE_TEXTURE_CUBE,
-                               VC4_TEX_P0_CMMODE) |
-                 VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE));
-}
-
-static void
-write_texture_p1(struct vc4_context *vc4,
-                 struct vc4_texture_stateobj *texstate,
-                 uint32_t unit)
-{
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-        struct pipe_sampler_state *sampler = texstate->samplers[unit];
-        static const uint8_t minfilter_map[6] = {
-                VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
-                VC4_TEX_P1_MINFILT_LIN_MIP_NEAR,
-                VC4_TEX_P1_MINFILT_NEAR_MIP_LIN,
-                VC4_TEX_P1_MINFILT_LIN_MIP_LIN,
-                VC4_TEX_P1_MINFILT_NEAREST,
-                VC4_TEX_P1_MINFILT_LINEAR,
-        };
-        static const uint32_t magfilter_map[] = {
-                [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST,
-                [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR,
-        };
-
-        bool either_nearest =
-                (sampler->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
-                 sampler->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
-
-        cl_aligned_u32(&vc4->uniforms,
-               VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) |
-               VC4_SET_FIELD(texture->texture->height0 & 2047,
-                             VC4_TEX_P1_HEIGHT) |
-               VC4_SET_FIELD(texture->texture->width0 & 2047,
-                             VC4_TEX_P1_WIDTH) |
-               VC4_SET_FIELD(magfilter_map[sampler->mag_img_filter],
-                             VC4_TEX_P1_MAGFILT) |
-               VC4_SET_FIELD(minfilter_map[sampler->min_mip_filter * 2 +
-                                           sampler->min_img_filter],
-                             VC4_TEX_P1_MINFILT) |
-               VC4_SET_FIELD(translate_wrap(sampler->wrap_s, either_nearest),
-                             VC4_TEX_P1_WRAP_S) |
-               VC4_SET_FIELD(translate_wrap(sampler->wrap_t, either_nearest),
-                             VC4_TEX_P1_WRAP_T));
-}
-
-static void
-write_texture_p2(struct vc4_context *vc4,
-                 struct vc4_texture_stateobj *texstate,
-                 uint32_t data)
-{
-        uint32_t unit = data & 0xffff;
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-
-        cl_aligned_u32(&vc4->uniforms,
-               VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE,
-                             VC4_TEX_P2_PTYPE) |
-               VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) |
-               VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
-}
-
-
-#define SWIZ(x,y,z,w) {          \
-        UTIL_FORMAT_SWIZZLE_##x, \
-        UTIL_FORMAT_SWIZZLE_##y, \
-        UTIL_FORMAT_SWIZZLE_##z, \
-        UTIL_FORMAT_SWIZZLE_##w  \
-}
-
-static void
-write_texture_border_color(struct vc4_context *vc4,
-                           struct vc4_texture_stateobj *texstate,
-                           uint32_t unit)
-{
-        struct pipe_sampler_state *sampler = texstate->samplers[unit];
-        struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc4_resource *rsc = vc4_resource(texture->texture);
-        union util_color uc;
-
-        const struct util_format_description *tex_format_desc =
-                util_format_description(texture->format);
-
-        float border_color[4];
-        for (int i = 0; i < 4; i++)
-                border_color[i] = sampler->border_color.f[i];
-        if (util_format_is_srgb(texture->format)) {
-                for (int i = 0; i < 3; i++)
-                        border_color[i] =
-                                util_format_linear_to_srgb_float(border_color[i]);
-        }
-
-        /* Turn the border color into the layout of channels that it would
-         * have when stored as texture contents.
-         */
-        float storage_color[4];
-        util_format_unswizzle_4f(storage_color,
-                                 border_color,
-                                 tex_format_desc->swizzle);
-
-        /* Now, pack so that when the vc4_format-sampled texture contents are
-         * replaced with our border color, the vc4_get_format_swizzle()
-         * swizzling will get the right channels.
-         */
-        if (util_format_is_depth_or_stencil(texture->format)) {
-                uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
-                                       sampler->border_color.f[0]) << 8;
-        } else {
-                switch (rsc->vc4_format) {
-                default:
-                case VC4_TEXTURE_TYPE_RGBA8888:
-                        util_pack_color(storage_color,
-                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
-                        break;
-                case VC4_TEXTURE_TYPE_RGBA4444:
-                        util_pack_color(storage_color,
-                                        PIPE_FORMAT_A8B8G8R8_UNORM, &uc);
-                        break;
-                case VC4_TEXTURE_TYPE_RGB565:
-                        util_pack_color(storage_color,
-                                        PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
-                        break;
-                case VC4_TEXTURE_TYPE_ALPHA:
-                        uc.ui[0] = float_to_ubyte(storage_color[0]) << 24;
-                        break;
-                case VC4_TEXTURE_TYPE_LUMALPHA:
-                        uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) |
-                                    (float_to_ubyte(storage_color[0]) << 0));
-                        break;
-                }
-        }
-
-        cl_aligned_u32(&vc4->uniforms, uc.ui[0]);
-}
-
-static uint32_t
-get_texrect_scale(struct vc4_texture_stateobj *texstate,
-                  enum quniform_contents contents,
-                  uint32_t data)
-{
-        struct pipe_sampler_view *texture = texstate->textures[data];
-        uint32_t dim;
-
-        if (contents == QUNIFORM_TEXRECT_SCALE_X)
-                dim = texture->texture->width0;
-        else
-                dim = texture->texture->height0;
-
-        return fui(1.0f / dim);
-}
-
-static struct vc4_bo *
-vc4_upload_ubo(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
-               const uint32_t *gallium_uniforms)
-{
-        if (!shader->ubo_size)
-                return NULL;
-
-        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
-        uint32_t *data = vc4_bo_map(ubo);
-        for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
-                memcpy(data + shader->ubo_ranges[i].dst_offset,
-                       gallium_uniforms + shader->ubo_ranges[i].src_offset,
-                       shader->ubo_ranges[i].size);
-        }
-
-        return ubo;
-}
-
-void
-vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
-                   struct vc4_constbuf_stateobj *cb,
-                   struct vc4_texture_stateobj *texstate)
-{
-        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
-        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
-        struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
-
-        cl_ensure_space(&vc4->uniforms, (uinfo->count +
-                                         uinfo->num_texture_samples) * 4);
-
-        cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);
-
-        for (int i = 0; i < uinfo->count; i++) {
-
-                switch (uinfo->contents[i]) {
-                case QUNIFORM_CONSTANT:
-                        cl_aligned_u32(&vc4->uniforms, uinfo->data[i]);
-                        break;
-                case QUNIFORM_UNIFORM:
-                        cl_aligned_u32(&vc4->uniforms,
-                                       gallium_uniforms[uinfo->data[i]]);
-                        break;
-                case QUNIFORM_VIEWPORT_X_SCALE:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[0] * 16.0f);
-                        break;
-                case QUNIFORM_VIEWPORT_Y_SCALE:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[1] * 16.0f);
-                        break;
-
-                case QUNIFORM_VIEWPORT_Z_OFFSET:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.translate[2]);
-                        break;
-                case QUNIFORM_VIEWPORT_Z_SCALE:
-                        cl_aligned_f(&vc4->uniforms, vc4->viewport.scale[2]);
-                        break;
-
-                case QUNIFORM_USER_CLIP_PLANE:
-                        cl_aligned_f(&vc4->uniforms,
-                                     vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
-                        break;
-
-                case QUNIFORM_TEXTURE_CONFIG_P0:
-                        write_texture_p0(vc4, texstate, uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_TEXTURE_CONFIG_P1:
-                        write_texture_p1(vc4, texstate, uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_TEXTURE_CONFIG_P2:
-                        write_texture_p2(vc4, texstate, uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_UBO_ADDR:
-                        cl_aligned_reloc(vc4, &vc4->uniforms, ubo, 0);
-                        break;
-
-                case QUNIFORM_TEXTURE_BORDER_COLOR:
-                        write_texture_border_color(vc4, texstate, uinfo->data[i]);
-                        break;
-
-                case QUNIFORM_TEXRECT_SCALE_X:
-                case QUNIFORM_TEXRECT_SCALE_Y:
-                        cl_aligned_u32(&vc4->uniforms,
-                                       get_texrect_scale(texstate,
-                                                         uinfo->contents[i],
-                                                         uinfo->data[i]));
-                        break;
-
-                case QUNIFORM_BLEND_CONST_COLOR:
-                        cl_aligned_f(&vc4->uniforms,
-                                     CLAMP(vc4->blend_color.color[uinfo->data[i]], 0, 1));
-                        break;
-
-                case QUNIFORM_STENCIL:
-                        cl_aligned_u32(&vc4->uniforms,
-                                       vc4->zsa->stencil_uniforms[uinfo->data[i]] |
-                                       (uinfo->data[i] <= 1 ?
-                                        (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) :
-                                        0));
-                        break;
-
-                case QUNIFORM_ALPHA_REF:
-                        cl_aligned_f(&vc4->uniforms,
-                                     vc4->zsa->base.alpha.ref_value);
-                        break;
-                }
-#if 0
-                uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4);
-                fprintf(stderr, "%p: %d / 0x%08x (%f)\n",
-                        shader, i, written_val, uif(written_val));
-#endif
-        }
-}
-
 static void
 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 1c96ef4795f..254140a72f5 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -96,10 +96,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_TEX_B] = { "tex_b", 0, 2 },
         [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
         [QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
-        [QOP_R4_UNPACK_A] = { "r4_unpack_a", 1, 1 },
-        [QOP_R4_UNPACK_B] = { "r4_unpack_b", 1, 1 },
-        [QOP_R4_UNPACK_C] = { "r4_unpack_c", 1, 1 },
-        [QOP_R4_UNPACK_D] = { "r4_unpack_d", 1, 1 },
         [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 },
         [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 },
         [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 },
@@ -234,20 +230,6 @@ qir_writes_r4(struct qinst *inst)
         }
 }
 
-bool
-qir_reads_r4(struct qinst *inst)
-{
-        switch (inst->op) {
-        case QOP_R4_UNPACK_A:
-        case QOP_R4_UNPACK_B:
-        case QOP_R4_UNPACK_C:
-        case QOP_R4_UNPACK_D:
-                return true;
-        default:
-                return false;
-        }
-}
-
 static void
 qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 732cfd0b306..cade795c12a 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -36,6 +36,11 @@
 #include "util/list.h"
 #include "util/u_math.h"
 
+#include "vc4_screen.h"
+#include "pipe/p_state.h"
+
+struct nir_builder;
+
 enum qfile {
         QFILE_NULL,
         QFILE_TEMP,
@@ -155,10 +160,6 @@ enum qop {
          * the destination
          */
         QOP_TEX_RESULT,
-        QOP_R4_UNPACK_A,
-        QOP_R4_UNPACK_B,
-        QOP_R4_UNPACK_C,
-        QOP_R4_UNPACK_D
 };
 
 struct queued_qpu_inst {
@@ -243,7 +244,11 @@ enum quniform_contents {
 
         QUNIFORM_TEXTURE_BORDER_COLOR,
 
-        QUNIFORM_BLEND_CONST_COLOR,
+        QUNIFORM_BLEND_CONST_COLOR_X,
+        QUNIFORM_BLEND_CONST_COLOR_Y,
+        QUNIFORM_BLEND_CONST_COLOR_Z,
+        QUNIFORM_BLEND_CONST_COLOR_W,
+
         QUNIFORM_STENCIL,
 
         QUNIFORM_ALPHA_REF,
@@ -280,6 +285,52 @@ struct vc4_compiler_ubo_range {
         bool used;
 };
 
+struct vc4_key { /* shared "base" member of vc4_fs_key and vc4_vs_key */
+        struct vc4_uncompiled_shader *shader_state;
+        struct {
+                enum pipe_format format;
+                unsigned compare_mode:1;
+                unsigned compare_func:3;
+                unsigned wrap_s:3; /* presumably PIPE_TEX_WRAP_* -- TODO confirm */
+                unsigned wrap_t:3; /* presumably PIPE_TEX_WRAP_* -- TODO confirm */
+                uint8_t swizzle[4];
+        } tex[VC4_MAX_TEXTURE_SAMPLERS];
+        uint8_t ucp_enables; /* presumably user clip plane enables -- TODO confirm */
+};
+
+struct vc4_fs_key {
+        struct vc4_key base;
+        enum pipe_format color_format;
+        bool depth_enabled;
+        bool stencil_enabled;
+        bool stencil_twoside;
+        bool stencil_full_writemasks;
+        bool is_points;
+        bool is_lines;
+        bool alpha_test;
+        bool point_coord_upper_left;
+        bool light_twoside;
+        uint8_t alpha_test_func;
+        uint8_t logicop_func;
+        uint32_t point_sprite_mask;
+
+        struct pipe_rt_blend_state blend;
+};
+
+struct vc4_vs_key {
+        struct vc4_key base; /* &key->base is passed to vc4_get_compiled_shader() */
+
+        /**
+         * This is a proxy for the array of FS input semantics, which is
+         * larger than we would want to put in the key.
+         */
+        uint64_t compiled_fs_id;
+
+        enum pipe_format attr_formats[8];
+        bool is_coord; /* set to true before the QSTAGE_COORD shader lookup */
+        bool per_vertex_point_size;
+};
+
 struct vc4_compile {
         struct vc4_context *vc4;
         nir_shader *s;
@@ -369,6 +420,16 @@ struct vc4_compile {
         uint32_t variant_id;
 };
 
+/* Special nir_load_input intrinsic index for loading the current TLB
+ * destination color.
+ */
+#define VC4_NIR_TLB_COLOR_READ_INPUT		2000000000
+
+/* Special offset for nir_load_uniform values to get a QUNIFORM_*
+ * state-dependent value.
+ */
+#define VC4_NIR_STATE_UNIFORM_OFFSET		2000000000
+
 struct vc4_compile *qir_compile_init(void);
 void qir_compile_destroy(struct vc4_compile *c);
 struct qinst *qir_inst(enum qop op, struct qreg dst,
@@ -393,7 +454,6 @@ bool qir_is_multi_instruction(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
-bool qir_reads_r4(struct qinst *inst);
 bool qir_src_needs_a_file(struct qinst *inst);
 struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
 
@@ -409,6 +469,12 @@ bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
 bool qir_opt_vpm_writes(struct vc4_compile *c);
+void vc4_nir_lower_blend(struct vc4_compile *c);
+void vc4_nir_lower_io(struct vc4_compile *c);
+nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
+                                       enum quniform_contents contents);
+nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
+                                          nir_ssa_def **srcs, int swiz);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 void qpu_schedule_instructions(struct vc4_compile *c);
@@ -523,26 +589,11 @@ QIR_ALU0(FRAG_W)
 QIR_ALU0(FRAG_REV_FLAG)
 QIR_ALU0(TEX_RESULT)
 QIR_ALU0(TLB_COLOR_READ)
+QIR_NODST_1(TLB_COLOR_WRITE)
 QIR_NODST_1(TLB_Z_WRITE)
 QIR_NODST_1(TLB_DISCARD_SETUP)
 QIR_NODST_1(TLB_STENCIL_SETUP)
 
-static inline struct qreg
-qir_R4_UNPACK(struct vc4_compile *c, struct qreg r4, int i)
-{
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, r4, c->undef));
-        return t;
-}
-
-static inline struct qreg
-qir_SEL_X_0_COND(struct vc4_compile *c, int i)
-{
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_R4_UNPACK_A + i, t, c->undef, c->undef));
-        return t;
-}
-
 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index 910c89dca79..f087c3b81b5 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -52,7 +52,7 @@ static void
 add_uniform(struct hash_table *ht, struct qreg reg)
 {
         struct hash_entry *entry;
-        void *key = (void *)(uintptr_t)reg.index;
+        void *key = (void *)(uintptr_t)(reg.index + 1);
 
         entry = _mesa_hash_table_search(ht, key);
         if (entry) {
@@ -66,7 +66,7 @@ static void
 remove_uniform(struct hash_table *ht, struct qreg reg)
 {
         struct hash_entry *entry;
-        void *key = (void *)(uintptr_t)reg.index;
+        void *key = (void *)(uintptr_t)(reg.index + 1);
 
         entry = _mesa_hash_table_search(ht, key);
         assert(entry);
@@ -122,7 +122,7 @@ qir_lower_uniforms(struct vc4_compile *c)
                 struct hash_entry *entry;
                 hash_table_foreach(ht, entry) {
                         uint32_t count = (uintptr_t)entry->data;
-                        uint32_t index = (uintptr_t)entry->key;
+                        uint32_t index = (uintptr_t)entry->key - 1;
                         if (count > max_count) {
                                 max_count = count;
                                 max_index = index;
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index c9ab6344589..fbb90ba12a0 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -122,23 +122,23 @@ static inline struct qpu_reg qpu_r3(void) { return qpu_rn(3); }
 static inline struct qpu_reg qpu_r4(void) { return qpu_rn(4); }
 static inline struct qpu_reg qpu_r5(void) { return qpu_rn(5); }
 
-uint64_t qpu_NOP(void);
-uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src);
-uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src);
+uint64_t qpu_NOP(void) ATTRIBUTE_CONST;
+uint64_t qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST;
+uint64_t qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src) ATTRIBUTE_CONST;
 uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst,
-                    struct qpu_reg src0, struct qpu_reg src1);
+                    struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST;
 uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst,
-                    struct qpu_reg src0, struct qpu_reg src1);
-uint64_t qpu_merge_inst(uint64_t a, uint64_t b);
-uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
-uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
-uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond);
-uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond);
-uint32_t qpu_encode_small_immediate(uint32_t i);
+                    struct qpu_reg src0, struct qpu_reg src1) ATTRIBUTE_CONST;
+uint64_t qpu_merge_inst(uint64_t a, uint64_t b) ATTRIBUTE_CONST;
+uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val) ATTRIBUTE_CONST;
+uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST;
+uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
+uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
+uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST;
 
-bool qpu_waddr_is_tlb(uint32_t waddr);
-bool qpu_inst_is_tlb(uint64_t inst);
-int qpu_num_sf_accesses(uint64_t inst);
+bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST;
+bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST;
+int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST;
 void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst);
 
 static inline uint64_t
diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index 55e0e6139b5..00aeb300a9b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -225,7 +225,7 @@ static const char *qpu_condflags[] = {
 };
 
 #define DESC(array, index)                                        \
-        ((index > ARRAY_SIZE(array) || !(array)[index]) ?         \
+        (((index) >= ARRAY_SIZE(array) || !(array)[index]) ?       \
          "???" : (array)[index])
 
 static const char *
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 99afe4b8798..f324056258c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -234,6 +234,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         case QFILE_VPM:
                                 assert((int)qinst->src[i].index >=
                                        last_vpm_read_index);
+                                (void)last_vpm_read_index;
                                 last_vpm_read_index = qinst->src[i].index;
                                 src[i] = qpu_ra(QPU_R_VPM);
                                 break;
@@ -319,7 +320,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 abort();
                         }
 
-                        queue(c, qpu_a_MOV(dst, qpu_r4()));
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
 
                         break;
 
@@ -402,6 +404,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_COLOR_LOAD);
 
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
                 case QOP_TLB_COLOR_WRITE:
@@ -451,21 +455,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         queue(c, qpu_NOP());
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_LOAD_TMU0);
-
-                        break;
-
-                case QOP_R4_UNPACK_A:
-                case QOP_R4_UNPACK_B:
-                case QOP_R4_UNPACK_C:
-                case QOP_R4_UNPACK_D:
-                        assert(src[0].mux == QPU_MUX_R4);
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        *last_inst(c) |= QPU_PM;
-                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
-                                                       (qinst->op -
-                                                        QOP_R4_UNPACK_A),
-                                                       QPU_UNPACK);
-
+                        if (dst.mux != QPU_MUX_R4)
+                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
                 case QOP_UNPACK_8A_F:
@@ -474,20 +465,30 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QOP_UNPACK_8D_F:
                 case QOP_UNPACK_16A_F:
                 case QOP_UNPACK_16B_F: {
-                        assert(src[0].mux == QPU_MUX_A);
+                        if (src[0].mux == QPU_MUX_R4) {
+                                queue(c, qpu_a_MOV(dst, src[0]));
+                                *last_inst(c) |= QPU_PM;
+                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
+                                                               (qinst->op -
+                                                                QOP_UNPACK_8A_F),
+                                                               QPU_UNPACK);
+                        } else {
+                                assert(src[0].mux == QPU_MUX_A);
 
-                        /* Since we're setting the pack bits, if the
-                         * destination is in A it would get re-packed.
-                         */
-                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
-                                             qpu_rb(31) : dst),
-                                            src[0], src[0]));
-                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
-                                                                  QOP_UNPACK_8A_F],
-                                                       QPU_UNPACK);
+                                /* Since we're setting the pack bits, if the
+                                 * destination is in A it would get re-packed.
+                                 */
+                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
+                                                     qpu_rb(31) : dst),
+                                                    src[0], src[0]));
+                                *last_inst(c) |=
+                                        QPU_SET_FIELD(unpack_map[qinst->op -
+                                                                 QOP_UNPACK_8A_F],
+                                                      QPU_UNPACK);
 
-                        if (dst.mux == QPU_MUX_A) {
-                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+                                if (dst.mux == QPU_MUX_A) {
+                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
+                                }
                         }
                 }
                         break;
diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c
index 8471edbf62c..9cf6841f41c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_validate.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c
@@ -23,6 +23,13 @@
 
 #include "vc4_qpu.h"
 
+#ifdef NDEBUG
+/* Since most of our code is used in assert()s, don't warn about dead code. */
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
 static bool
 writes_reg(uint64_t inst, uint32_t w)
 {
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 3b0b890b66a..a29db1f3abe 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -116,6 +116,8 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
         vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
                 /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
                  * vc4_qpu_emit.c
@@ -126,15 +128,18 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
                 /* R4 can't be written as a general purpose register. (it's
                  * TMU_NOSWAP as a write address).
                  */
-                if (vc4_regs[i].mux == QPU_MUX_R4)
+                if (vc4_regs[i].mux == QPU_MUX_R4) {
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
                         continue;
+                }
 
                 ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
         }
 
-        vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
-        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2)
+        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
                 ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+        }
 
         ra_set_finalize(vc4->regs, NULL);
 }
@@ -153,6 +158,10 @@ node_to_temp_priority(const void *in_a, const void *in_b)
         return a->priority - b->priority;
 }
 
+#define CLASS_BIT_A			(1 << 0) /* class_bits[] flag: temp may still be in file A */
+#define CLASS_BIT_B_OR_ACC		(1 << 1) /* presumably file B or an accumulator -- TODO confirm */
+#define CLASS_BIT_R4			(1 << 2) /* class_bits[] flag: temp may still be in r4 */
+
 /**
  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
  *
@@ -165,6 +174,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t temp_to_node[c->num_temps];
         uint32_t def[c->num_temps];
         uint32_t use[c->num_temps];
+        uint8_t class_bits[c->num_temps];
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
         memset(def, 0, sizeof(def));
@@ -181,10 +191,6 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                          c->num_temps);
 
-        for (uint32_t i = 0; i < c->num_temps; i++) {
-                ra_set_node_class(g, i, vc4->reg_class_any);
-        }
-
         /* Compute the live ranges so we can figure out interference.
          */
         uint32_t ip = 0;
@@ -223,8 +229,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 temp_to_node[map[i].temp] = i;
         }
 
-        /* Figure out our register classes and preallocated registers*/
+        /* Figure out our register classes and preallocated registers.  We
+         * start with any temp being able to be in any file, then instructions
+         * incrementally remove bits that the temp definitely can't be in.
+         */
+        memset(class_bits,
+               CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
+               sizeof(class_bits));
+
+        ip = 0;
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
+                if (qir_writes_r4(inst)) {
+                        /* This instruction writes r4 (and optionally moves
+                         * its result to a temp), so nothing else can be
+                         * stored in r4 across it.
+                         */
+                        for (int i = 0; i < c->num_temps; i++) {
+                                if (def[i] < ip && use[i] > ip)
+                                        class_bits[i] &= ~CLASS_BIT_R4;
+                        }
+                } else {
+                        /* R4 can't be written as a general purpose
+                         * register. (it's TMU_NOSWAP as a write address).
+                         */
+                        if (inst->dst.file == QFILE_TEMP)
+                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
+                }
+
                 switch (inst->op) {
                 case QOP_FRAG_Z:
                         ra_set_node_reg(g, temp_to_node[inst->dst.index],
@@ -236,17 +267,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                                         AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                         break;
 
-                case QOP_TEX_RESULT:
-                case QOP_TLB_COLOR_READ:
-                        assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
-                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
-                                        ACC_INDEX + 4);
-                        break;
-
                 case QOP_PACK_SCALED:
                         /* The pack flags require an A-file dst register. */
-                        ra_set_node_class(g, temp_to_node[inst->dst.index],
-                                          vc4->reg_class_a);
+                        class_bits[inst->dst.index] &= CLASS_BIT_A;
                         break;
 
                 default:
@@ -254,8 +277,30 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 }
 
                 if (qir_src_needs_a_file(inst)) {
-                        ra_set_node_class(g, temp_to_node[inst->src[0].index],
-                                          vc4->reg_class_a);
+                        class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                }
+                ip++;
+        }
+
+        for (uint32_t i = 0; i < c->num_temps; i++) {
+                int node = temp_to_node[i];
+
+                switch (class_bits[i]) {
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+                        ra_set_node_class(g, node, vc4->reg_class_any);
+                        break;
+                case CLASS_BIT_A | CLASS_BIT_R4:
+                        ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
+                        break;
+                case CLASS_BIT_A:
+                        ra_set_node_class(g, node, vc4->reg_class_a);
+                        break;
+                default:
+                        fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
+                                i, class_bits[i]);
+                        abort();
+                        break;
                 }
         }
 
@@ -270,7 +315,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         }
 
         bool ok = ra_allocate(g);
-        assert(ok);
+        if (!ok) {
+                fprintf(stderr, "Failed to register allocate:\n");
+                qir_dump(c);
+                abort();
+        }
 
         for (uint32_t i = 0; i < c->num_temps; i++) {
                 temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index cab76406055..5d5166fd818 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -102,6 +102,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
 
         if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
                 vc4_resource_bo_alloc(rsc);
+
+                /* If it might be bound as one of our vertex buffers, make
+                 * sure we re-emit vertex buffer state.
+                 */
+                if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                        vc4->dirty |= VC4_DIRTY_VTXBUF;
         } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
                 if (vc4_cl_references_bo(pctx, rsc->bo)) {
                         if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
@@ -110,6 +116,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                             prsc->height0 == box->height &&
                             prsc->depth0 == box->depth) {
                                 vc4_resource_bo_alloc(rsc);
+                                if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                                        vc4->dirty |= VC4_DIRTY_VTXBUF;
                         } else {
                                 vc4_flush(pctx);
                         }
diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h
index ab8f5d3cd55..87571b75e8b 100644
--- a/src/gallium/drivers/vc4/vc4_resource.h
+++ b/src/gallium/drivers/vc4/vc4_resource.h
@@ -82,19 +82,19 @@ struct vc4_resource {
         struct pipe_resource *shadow_parent;
 };
 
-static INLINE struct vc4_resource *
+static inline struct vc4_resource *
 vc4_resource(struct pipe_resource *prsc)
 {
         return (struct vc4_resource *)prsc;
 }
 
-static INLINE struct vc4_surface *
+static inline struct vc4_surface *
 vc4_surface(struct pipe_surface *psurf)
 {
         return (struct vc4_surface *)psurf;
 }
 
-static INLINE struct vc4_transfer *
+static inline struct vc4_transfer *
 vc4_transfer(struct pipe_transfer *ptrans)
 {
         return (struct vc4_transfer *)ptrans;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index f63bead0fbb..2dee1d40e5f 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -176,6 +176,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
         case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+	case PIPE_CAP_DEPTH_BOUNDS_TEST:
                 return 0;
 
                 /* Stream output. */
@@ -489,6 +493,12 @@ vc4_screen_bo_get_handle(struct pipe_screen *pscreen,
 {
         whandle->stride = stride;
 
+        /* If we're passing some reference to our BO out to some other part of
+         * the system, then we can't do any optimizations that rely on us being
+         * the only ones seeing it (like BO caching or shadow update avoidance).
+         */
+        bo->private = false;
+
         switch (whandle->type) {
         case DRM_API_HANDLE_TYPE_SHARED:
                 return vc4_bo_flink(bo, &whandle->handle);
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index b58013dd2ee..7cfd236349d 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -74,11 +74,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec)
         struct vc4_bo **bos = vc4->bo_pointers.base;
 
         exec->bo_count = args->bo_handle_count;
-        exec->bo = calloc(exec->bo_count, sizeof(struct vc4_bo_exec_state));
+        exec->bo = calloc(exec->bo_count, sizeof(void *));
         for (int i = 0; i < exec->bo_count; i++) {
                 struct vc4_bo *bo = bos[i];
                 struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo);
 
+                struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
 #if 0
                 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name);
 #endif
@@ -86,7 +87,16 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_exec_info *exec)
                 vc4_bo_map(bo);
                 memcpy(obj->vaddr, bo->map, bo->size);
 
-                exec->bo[i].bo = obj;
+                exec->bo[i] = obj;
+
+                /* The kernel does this validation at shader create ioctl
+                 * time.
+                 */
+                if (strcmp(bo->name, "code") == 0) {
+                        drm_bo->validated_shader = vc4_validate_shader(obj);
+                        if (!drm_bo->validated_shader)
+                                abort();
+                }
         }
         return 0;
 }
@@ -95,7 +105,7 @@ static int
 vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
 {
         for (int i = 0; i < exec->bo_count; i++) {
-                struct drm_gem_cma_object *obj = exec->bo[i].bo;
+                struct drm_gem_cma_object *obj = exec->bo[i];
                 struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo;
 
                 memcpy(bo->map, obj->vaddr, bo->size);
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 2bb36b253bb..68ace0216aa 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -78,6 +78,7 @@ struct drm_gem_cma_object {
 struct drm_vc4_bo {
         struct drm_gem_cma_object base;
         struct vc4_bo *bo;
+        struct vc4_validated_shader_info *validated_shader;
         struct list_head unref_head;
 };
 
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 4a1d4c3a4d6..8a759c2ca4c 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -107,7 +107,7 @@ vc4_create_rasterizer_state(struct pipe_context *pctx,
         /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
          * BCM21553).
          */
-        so->point_size = MAX2(cso->point_size, .125);
+        so->point_size = MAX2(cso->point_size, .125f);
 
         if (cso->front_ccw)
                 so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES;
@@ -461,11 +461,64 @@ vc4_get_stage_tex(struct vc4_context *vc4, unsigned shader)
         }
 }
 
+static uint32_t translate_wrap(uint32_t p_wrap, bool using_nearest)
+{
+        switch (p_wrap) {
+        case PIPE_TEX_WRAP_REPEAT:
+                return 0;
+        case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+                return 1;
+        case PIPE_TEX_WRAP_MIRROR_REPEAT:
+                return 2;
+        case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+                return 3;
+        case PIPE_TEX_WRAP_CLAMP:
+                return (using_nearest ? 1 : 3);
+        default:
+                fprintf(stderr, "Unknown wrap mode %d\n", p_wrap);
+                assert(!"not reached");
+                return 0;
+        }
+}
+
 static void *
 vc4_create_sampler_state(struct pipe_context *pctx,
                          const struct pipe_sampler_state *cso)
 {
-        return vc4_generic_cso_state_create(cso, sizeof(*cso));
+        static const uint8_t minfilter_map[6] = {
+                VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR,
+                VC4_TEX_P1_MINFILT_LIN_MIP_NEAR,
+                VC4_TEX_P1_MINFILT_NEAR_MIP_LIN,
+                VC4_TEX_P1_MINFILT_LIN_MIP_LIN,
+                VC4_TEX_P1_MINFILT_NEAREST,
+                VC4_TEX_P1_MINFILT_LINEAR,
+        };
+        static const uint32_t magfilter_map[] = {
+                [PIPE_TEX_FILTER_NEAREST] = VC4_TEX_P1_MAGFILT_NEAREST,
+                [PIPE_TEX_FILTER_LINEAR] = VC4_TEX_P1_MAGFILT_LINEAR,
+        };
+        bool either_nearest =
+                (cso->mag_img_filter == PIPE_TEX_MIPFILTER_NEAREST ||
+                 cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
+        struct vc4_sampler_state *so = CALLOC_STRUCT(vc4_sampler_state);
+
+        if (!so)
+                return NULL;
+
+        memcpy(so, cso, sizeof(*cso));
+
+        so->texture_p1 =
+                (VC4_SET_FIELD(magfilter_map[cso->mag_img_filter],
+                               VC4_TEX_P1_MAGFILT) |
+                 VC4_SET_FIELD(minfilter_map[cso->min_mip_filter * 2 +
+                                             cso->min_img_filter],
+                               VC4_TEX_P1_MINFILT) |
+                 VC4_SET_FIELD(translate_wrap(cso->wrap_s, either_nearest),
+                               VC4_TEX_P1_WRAP_S) |
+                 VC4_SET_FIELD(translate_wrap(cso->wrap_t, either_nearest),
+                               VC4_TEX_P1_WRAP_T));
+
+        return so;
 }
 
 static void
@@ -499,13 +552,13 @@ static struct pipe_sampler_view *
 vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
                         const struct pipe_sampler_view *cso)
 {
-        struct pipe_sampler_view *so = malloc(sizeof(*so));
+        struct vc4_sampler_view *so = malloc(sizeof(*so));
         struct vc4_resource *rsc = vc4_resource(prsc);
 
         if (!so)
                 return NULL;
 
-        *so = *cso;
+        so->base = *cso;
 
         pipe_reference(NULL, &prsc->reference);
 
@@ -516,18 +569,19 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
          * Also, Raspberry Pi doesn't support sampling from raster textures,
          * so we also have to copy to a temporary then.
          */
-        if (so->u.tex.first_level ||
+        if (cso->u.tex.first_level ||
             rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) {
                 struct vc4_resource *shadow_parent = vc4_resource(prsc);
                 struct pipe_resource tmpl = shadow_parent->base.b;
                 struct vc4_resource *clone;
 
                 tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
-                tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level);
-                tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level);
-                tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level;
+                tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level);
+                tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level);
+                tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level;
 
                 prsc = vc4_resource_create(pctx->screen, &tmpl);
+                rsc = vc4_resource(prsc);
                 clone = vc4_resource(prsc);
                 clone->shadow_parent = &shadow_parent->base.b;
                 /* Flag it as needing update of the contents from the parent. */
@@ -535,11 +589,23 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
 
                 assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R);
         }
-        so->texture = prsc;
-        so->reference.count = 1;
-        so->context = pctx;
+        so->base.texture = prsc;
+        so->base.reference.count = 1;
+        so->base.context = pctx;
 
-        return so;
+        so->texture_p0 =
+                (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) |
+                 VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) |
+                 VC4_SET_FIELD(cso->u.tex.last_level -
+                               cso->u.tex.first_level, VC4_TEX_P0_MIPLVLS) |
+                 VC4_SET_FIELD(cso->target == PIPE_TEXTURE_CUBE,
+                               VC4_TEX_P0_CMMODE));
+        so->texture_p1 =
+                (VC4_SET_FIELD(rsc->vc4_format >> 4, VC4_TEX_P1_TYPE4) |
+                 VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) |
+                 VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH));
+
+        return &so->base;
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_tiling.c b/src/gallium/drivers/vc4/vc4_tiling.c
index f9801c9cefd..cf86eb0fa31 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.c
+++ b/src/gallium/drivers/vc4/vc4_tiling.c
@@ -127,13 +127,10 @@ vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp)
 static void
 check_box_utile_alignment(const struct pipe_box *box, int cpp)
 {
-        uint32_t utile_w = vc4_utile_width(cpp);
-        uint32_t utile_h = vc4_utile_height(cpp);
-
-        assert(!(box->x & (utile_w - 1)));
-        assert(!(box->y & (utile_h - 1)));
-        assert(!(box->width & (utile_w - 1)));
-        assert(!(box->height & (utile_h - 1)));
+        assert(!(box->x & (vc4_utile_width(cpp) - 1)));
+        assert(!(box->y & (vc4_utile_height(cpp) - 1)));
+        assert(!(box->width & (vc4_utile_width(cpp) - 1)));
+        assert(!(box->height & (vc4_utile_height(cpp) - 1)));
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
index b5d10da3417..b90bba70200 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.h
+++ b/src/gallium/drivers/vc4/vc4_tiling.h
@@ -24,9 +24,9 @@
 #ifndef VC4_TILING_H
 #define VC4_TILING_H
 
-uint32_t vc4_utile_width(int cpp);
-uint32_t vc4_utile_height(int cpp);
-bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp);
+uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST;
+uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST;
+bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
 void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp);
 void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp);
 void vc4_load_tiled_image(void *dst, uint32_t dst_stride,
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
new file mode 100644
index 00000000000..85d6998205e
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_pack_color.h"
+#include "util/format_srgb.h"
+
+#include "vc4_context.h"
+#include "vc4_qir.h"
+
+static void
+write_texture_p0(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t unit)
+{
+        struct vc4_sampler_view *sview =
+                vc4_sampler_view(texstate->textures[unit]);
+        struct vc4_resource *rsc = vc4_resource(sview->base.texture);
+
+        cl_reloc(vc4, &vc4->uniforms, uniforms, rsc->bo, sview->texture_p0);
+}
+
+static void
+write_texture_p1(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t unit)
+{
+        struct vc4_sampler_view *sview =
+                vc4_sampler_view(texstate->textures[unit]);
+        struct vc4_sampler_state *sampler =
+                vc4_sampler_state(texstate->samplers[unit]);
+
+        cl_aligned_u32(uniforms, sview->texture_p1 | sampler->texture_p1);
+}
+
+static void
+write_texture_p2(struct vc4_context *vc4,
+                 struct vc4_cl_out **uniforms,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t data)
+{
+        uint32_t unit = data & 0xffff;
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(texture->texture);
+
+        cl_aligned_u32(uniforms,
+               VC4_SET_FIELD(VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE,
+                             VC4_TEX_P2_PTYPE) |
+               VC4_SET_FIELD(rsc->cube_map_stride >> 12, VC4_TEX_P2_CMST) |
+               VC4_SET_FIELD((data >> 16) & 1, VC4_TEX_P2_BSLOD));
+}
+
+
+#define SWIZ(x,y,z,w) {          \
+        UTIL_FORMAT_SWIZZLE_##x, \
+        UTIL_FORMAT_SWIZZLE_##y, \
+        UTIL_FORMAT_SWIZZLE_##z, \
+        UTIL_FORMAT_SWIZZLE_##w  \
+}
+
+static void
+write_texture_border_color(struct vc4_context *vc4,
+                           struct vc4_cl_out **uniforms,
+                           struct vc4_texture_stateobj *texstate,
+                           uint32_t unit)
+{
+        struct pipe_sampler_state *sampler = texstate->samplers[unit];
+        struct pipe_sampler_view *texture = texstate->textures[unit];
+        struct vc4_resource *rsc = vc4_resource(texture->texture);
+        union util_color uc;
+
+        const struct util_format_description *tex_format_desc =
+                util_format_description(texture->format);
+
+        float border_color[4];
+        for (int i = 0; i < 4; i++)
+                border_color[i] = sampler->border_color.f[i];
+        if (util_format_is_srgb(texture->format)) {
+                for (int i = 0; i < 3; i++)
+                        border_color[i] =
+                                util_format_linear_to_srgb_float(border_color[i]);
+        }
+
+        /* Turn the border color into the layout of channels that it would
+         * have when stored as texture contents.
+         */
+        float storage_color[4];
+        util_format_unswizzle_4f(storage_color,
+                                 border_color,
+                                 tex_format_desc->swizzle);
+
+        /* Now, pack so that when the vc4_format-sampled texture contents are
+         * replaced with our border color, the vc4_get_format_swizzle()
+         * swizzling will get the right channels.
+         */
+        if (util_format_is_depth_or_stencil(texture->format)) {
+                uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
+                                       sampler->border_color.f[0]) << 8;
+        } else {
+                switch (rsc->vc4_format) {
+                default:
+                case VC4_TEXTURE_TYPE_RGBA8888:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_R8G8B8A8_UNORM, &uc);
+                        break;
+                case VC4_TEXTURE_TYPE_RGBA4444:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_A8B8G8R8_UNORM, &uc);
+                        break;
+                case VC4_TEXTURE_TYPE_RGB565:
+                        util_pack_color(storage_color,
+                                        PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
+                        break;
+                case VC4_TEXTURE_TYPE_ALPHA:
+                        uc.ui[0] = float_to_ubyte(storage_color[0]) << 24;
+                        break;
+                case VC4_TEXTURE_TYPE_LUMALPHA:
+                        uc.ui[0] = ((float_to_ubyte(storage_color[1]) << 24) |
+                                    (float_to_ubyte(storage_color[0]) << 0));
+                        break;
+                }
+        }
+
+        cl_aligned_u32(uniforms, uc.ui[0]);
+}
+
+static uint32_t
+get_texrect_scale(struct vc4_texture_stateobj *texstate,
+                  enum quniform_contents contents,
+                  uint32_t data)
+{
+        struct pipe_sampler_view *texture = texstate->textures[data];
+        uint32_t dim;
+
+        if (contents == QUNIFORM_TEXRECT_SCALE_X)
+                dim = texture->texture->width0;
+        else
+                dim = texture->texture->height0;
+
+        return fui(1.0f / dim);
+}
+
+static struct vc4_bo *
+vc4_upload_ubo(struct vc4_context *vc4,
+               struct vc4_compiled_shader *shader,
+               const uint32_t *gallium_uniforms)
+{
+        if (!shader->ubo_size)
+                return NULL;
+
+        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, shader->ubo_size, "ubo");
+        uint32_t *data = vc4_bo_map(ubo);
+        for (uint32_t i = 0; i < shader->num_ubo_ranges; i++) {
+                memcpy(data + shader->ubo_ranges[i].dst_offset,
+                       gallium_uniforms + shader->ubo_ranges[i].src_offset,
+                       shader->ubo_ranges[i].size);
+        }
+
+        return ubo;
+}
+
+void
+vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
+                   struct vc4_constbuf_stateobj *cb,
+                   struct vc4_texture_stateobj *texstate)
+{
+        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
+        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
+        struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
+
+        cl_ensure_space(&vc4->uniforms, (uinfo->count +
+                                         uinfo->num_texture_samples) * 4);
+
+        struct vc4_cl_out *uniforms =
+                cl_start_shader_reloc(&vc4->uniforms,
+                                      uinfo->num_texture_samples);
+
+        for (int i = 0; i < uinfo->count; i++) {
+
+                switch (uinfo->contents[i]) {
+                case QUNIFORM_CONSTANT:
+                        cl_aligned_u32(&uniforms, uinfo->data[i]);
+                        break;
+                case QUNIFORM_UNIFORM:
+                        cl_aligned_u32(&uniforms,
+                                       gallium_uniforms[uinfo->data[i]]);
+                        break;
+                case QUNIFORM_VIEWPORT_X_SCALE:
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f);
+                        break;
+                case QUNIFORM_VIEWPORT_Y_SCALE:
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[1] * 16.0f);
+                        break;
+
+                case QUNIFORM_VIEWPORT_Z_OFFSET:
+                        cl_aligned_f(&uniforms, vc4->viewport.translate[2]);
+                        break;
+                case QUNIFORM_VIEWPORT_Z_SCALE:
+                        cl_aligned_f(&uniforms, vc4->viewport.scale[2]);
+                        break;
+
+                case QUNIFORM_USER_CLIP_PLANE:
+                        cl_aligned_f(&uniforms,
+                                     vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P0:
+                        write_texture_p0(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P1:
+                        write_texture_p1(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P2:
+                        write_texture_p2(vc4, &uniforms, texstate,
+                                         uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_UBO_ADDR:
+                        cl_aligned_reloc(vc4, &vc4->uniforms, &uniforms, ubo, 0);
+                        break;
+
+                case QUNIFORM_TEXTURE_BORDER_COLOR:
+                        write_texture_border_color(vc4, &uniforms,
+                                                   texstate, uinfo->data[i]);
+                        break;
+
+                case QUNIFORM_TEXRECT_SCALE_X:
+                case QUNIFORM_TEXRECT_SCALE_Y:
+                        cl_aligned_u32(&uniforms,
+                                       get_texrect_scale(texstate,
+                                                         uinfo->contents[i],
+                                                         uinfo->data[i]));
+                        break;
+
+                case QUNIFORM_BLEND_CONST_COLOR_X:
+                case QUNIFORM_BLEND_CONST_COLOR_Y:
+                case QUNIFORM_BLEND_CONST_COLOR_Z:
+                case QUNIFORM_BLEND_CONST_COLOR_W:
+                        cl_aligned_f(&uniforms,
+                                     CLAMP(vc4->blend_color.color[uinfo->contents[i] -
+                                                                  QUNIFORM_BLEND_CONST_COLOR_X],
+                                           0, 1));
+                        break;
+
+                case QUNIFORM_STENCIL:
+                        cl_aligned_u32(&uniforms,
+                                       vc4->zsa->stencil_uniforms[uinfo->data[i]] |
+                                       (uinfo->data[i] <= 1 ?
+                                        (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) :
+                                        0));
+                        break;
+
+                case QUNIFORM_ALPHA_REF:
+                        cl_aligned_f(&uniforms,
+                                     vc4->zsa->base.alpha.ref_value);
+                        break;
+                }
+#if 0
+                uint32_t written_val = *((uint32_t *)uniforms - 1);
+                fprintf(stderr, "%p: %d / 0x%08x (%f)\n",
+                        shader, i, written_val, uif(written_val));
+#endif
+        }
+
+        cl_end(&vc4->uniforms, uniforms);
+
+        vc4_bo_unreference(&ubo);
+}
+
+void
+vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
+{
+        uint32_t dirty = 0;
+
+        for (int i = 0; i < shader->uniforms.count; i++) {
+                switch (shader->uniforms.contents[i]) {
+                case QUNIFORM_CONSTANT:
+                        break;
+                case QUNIFORM_UNIFORM:
+                case QUNIFORM_UBO_ADDR:
+                        dirty |= VC4_DIRTY_CONSTBUF;
+                        break;
+
+                case QUNIFORM_VIEWPORT_X_SCALE:
+                case QUNIFORM_VIEWPORT_Y_SCALE:
+                case QUNIFORM_VIEWPORT_Z_OFFSET:
+                case QUNIFORM_VIEWPORT_Z_SCALE:
+                        dirty |= VC4_DIRTY_VIEWPORT;
+                        break;
+
+                case QUNIFORM_USER_CLIP_PLANE:
+                        dirty |= VC4_DIRTY_CLIP;
+                        break;
+
+                case QUNIFORM_TEXTURE_CONFIG_P0:
+                case QUNIFORM_TEXTURE_CONFIG_P1:
+                case QUNIFORM_TEXTURE_CONFIG_P2:
+                case QUNIFORM_TEXTURE_BORDER_COLOR:
+                case QUNIFORM_TEXRECT_SCALE_X:
+                case QUNIFORM_TEXRECT_SCALE_Y:
+                        dirty |= VC4_DIRTY_TEXSTATE;
+                        break;
+
+                case QUNIFORM_BLEND_CONST_COLOR_X:
+                case QUNIFORM_BLEND_CONST_COLOR_Y:
+                case QUNIFORM_BLEND_CONST_COLOR_Z:
+                case QUNIFORM_BLEND_CONST_COLOR_W:
+                        dirty |= VC4_DIRTY_BLEND_COLOR;
+                        break;
+
+                case QUNIFORM_STENCIL:
+                case QUNIFORM_ALPHA_REF:
+                        dirty |= VC4_DIRTY_ZSA;
+                        break;
+                }
+        }
+
+        shader->uniform_dirty_bits = dirty;
+}
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 0e953695b52..7eed57018b7 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -94,11 +94,6 @@ typedef unsigned char boolean;
 #endif
 #endif
 
-/* XXX: Use standard `inline` keyword instead */
-#ifndef INLINE
-#  define INLINE inline
-#endif
-
 /* Forced function inlining */
 #ifndef ALWAYS_INLINE
 #  ifdef __GNUC__
@@ -106,7 +101,7 @@ typedef unsigned char boolean;
 #  elif defined(_MSC_VER)
 #    define ALWAYS_INLINE __forceinline
 #  else
-#    define ALWAYS_INLINE INLINE
+#    define ALWAYS_INLINE inline
 #  endif
 #endif
 
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index 794aabe85f2..ac14f86fdc4 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -100,8 +100,8 @@
 #else
 #define PIPE_ARCH_SSE
 #endif
-#if defined(PIPE_CC_GCC) && !defined(__SSSE3__)
-/* #warning SSE3 support requires -msse3 compiler options */
+#if defined(PIPE_CC_GCC) && (__GNUC__ * 100 + __GNUC_MINOR__) < 409 && !defined(__SSSE3__)
+/* #warning SSSE3 support requires -mssse3 compiler options before GCC 4.9 */
 #else
 #define PIPE_ARCH_SSSE3
 #endif
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index c2eedf8e7c7..f89dae98a2f 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -48,6 +48,7 @@ struct pipe_depth_stencil_alpha_state;
 struct pipe_draw_info;
 struct pipe_fence_handle;
 struct pipe_framebuffer_state;
+struct pipe_image_view;
 struct pipe_index_buffer;
 struct pipe_query;
 struct pipe_poly_stipple;
@@ -57,6 +58,7 @@ struct pipe_resource;
 struct pipe_sampler_state;
 struct pipe_sampler_view;
 struct pipe_scissor_state;
+struct pipe_shader_buffer;
 struct pipe_shader_state;
 struct pipe_stencil_ref;
 struct pipe_stream_output_target;
@@ -236,20 +238,38 @@ struct pipe_context {
                           const float default_inner_level[2]);
 
    /**
-    * Bind an array of shader resources that will be used by the
-    * graphics pipeline.  Any resources that were previously bound to
-    * the specified range will be unbound after this call.
+    * Bind an array of shader buffers that will be used by a shader.
+    * Any buffers that were previously bound to the specified range
+    * will be unbound.
     *
-    * \param start      first resource to bind.
-    * \param count      number of consecutive resources to bind.
-    * \param resources  array of pointers to the resources to bind, it
+    * \param shader     selects shader stage
+    * \param start_slot first buffer slot to bind.
+    * \param count      number of consecutive buffers to bind.
+    * \param buffers    array of the buffers to bind, it
     *                   should contain at least \a count elements
-    *                   unless it's NULL, in which case no new
-    *                   resources will be bound.
+    *                   unless it's NULL, in which case no buffers will
+    *                   be bound.
     */
-   void (*set_shader_resources)(struct pipe_context *,
-                                unsigned start, unsigned count,
-                                struct pipe_surface **resources);
+   void (*set_shader_buffers)(struct pipe_context *, unsigned shader,
+                              unsigned start_slot, unsigned count,
+                              struct pipe_shader_buffer *buffers);
+
+   /**
+    * Bind an array of images that will be used by a shader.
+    * Any images that were previously bound to the specified range
+    * will be unbound.
+    *
+    * \param shader     selects shader stage
+    * \param start_slot first image slot to bind.
+    * \param count      number of consecutive images to bind.
+    * \param images     array of pointers to the images to bind, it
+    *                   should contain at least \a count elements
+    *                   unless it's NULL, in which case no images will
+    *                   be bound.
+    */
+   void (*set_shader_images)(struct pipe_context *, unsigned shader,
+                             unsigned start_slot, unsigned count,
+                             struct pipe_image_view **images);
 
    void (*set_vertex_buffers)( struct pipe_context *,
                                unsigned start_slot,
@@ -361,8 +381,14 @@ struct pipe_context {
                         const void *clear_value,
                         int clear_value_size);
 
-   /** Flush draw commands
+   /**
+    * Flush draw commands
     *
+    * NOTE: use screen->fence_reference() (or equivalent) to transfer
+    * new fence ref to **fence, to ensure that previous fence is unref'd
+    *
+    * \param fence  if not NULL, an old fence to unref and transfer a
+    *    new fence reference to
     * \param flags  bitfield of enum pipe_flush_flags values.
     */
    void (*flush)(struct pipe_context *pipe,
@@ -391,6 +417,17 @@ struct pipe_context {
    void (*surface_destroy)(struct pipe_context *ctx,
                            struct pipe_surface *);
 
+   /**
+    * Create an image view into a buffer or texture to be used with load,
+    * store, and atomic instructions by a shader stage.
+    */
+   struct pipe_image_view * (*create_image_view)(struct pipe_context *ctx,
+                                                 struct pipe_resource *texture,
+                                                 const struct pipe_image_view *templat);
+
+   void (*image_view_destroy)(struct pipe_context *ctx,
+                              struct pipe_image_view *view);
+
    /**
     * Map a resource.
     *
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 88b7b7699c1..2ba56eac793 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -351,9 +351,10 @@ enum pipe_flush_flags
 #define PIPE_BIND_CURSOR               (1 << 11) /* mouse cursor */
 #define PIPE_BIND_CUSTOM               (1 << 12) /* state-tracker/winsys usages */
 #define PIPE_BIND_GLOBAL               (1 << 13) /* set_global_binding */
-#define PIPE_BIND_SHADER_RESOURCE      (1 << 14) /* set_shader_resources */
-#define PIPE_BIND_COMPUTE_RESOURCE     (1 << 15) /* set_compute_resources */
-#define PIPE_BIND_COMMAND_ARGS_BUFFER  (1 << 16) /* pipe_draw_info.indirect */
+#define PIPE_BIND_SHADER_BUFFER        (1 << 14) /* set_shader_buffers */
+#define PIPE_BIND_SHADER_IMAGE         (1 << 15) /* set_shader_images */
+#define PIPE_BIND_COMPUTE_RESOURCE     (1 << 16) /* set_compute_resources */
+#define PIPE_BIND_COMMAND_ARGS_BUFFER  (1 << 17) /* pipe_draw_info.indirect */
 
 /**
  * The first two flags above were previously part of the amorphous
@@ -374,9 +375,9 @@ enum pipe_flush_flags
  * The third flag has been added to be able to force textures to be created
  * in linear mode (no tiling).
  */
-#define PIPE_BIND_SCANOUT     (1 << 17) /*  */
-#define PIPE_BIND_SHARED      (1 << 18) /* get_texture_handle ??? */
-#define PIPE_BIND_LINEAR      (1 << 19)
+#define PIPE_BIND_SCANOUT     (1 << 18) /*  */
+#define PIPE_BIND_SHARED      (1 << 19) /* get_texture_handle ??? */
+#define PIPE_BIND_LINEAR      (1 << 20)
 
 
 /**
@@ -605,6 +606,10 @@ enum pipe_cap
    PIPE_CAP_MULTISAMPLE_Z_RESOLVE,
    PIPE_CAP_RESOURCE_FROM_USER_MEMORY,
    PIPE_CAP_DEVICE_RESET_STATUS_QUERY,
+   PIPE_CAP_MAX_SHADER_PATCH_VARYINGS,
+   PIPE_CAP_TEXTURE_FLOAT_LINEAR,
+   PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR,
+   PIPE_CAP_DEPTH_BOUNDS_TEST,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -700,7 +705,8 @@ enum pipe_compute_cap
    PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
    PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY,
    PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS,
-   PIPE_COMPUTE_CAP_IMAGES_SUPPORTED
+   PIPE_COMPUTE_CAP_IMAGES_SUPPORTED,
+   PIPE_COMPUTE_CAP_SUBGROUP_SIZE
 };
 
 /**
@@ -759,6 +765,7 @@ union pipe_query_result
    /* PIPE_QUERY_PRIMITIVES_GENERATED */
    /* PIPE_QUERY_PRIMITIVES_EMITTED */
    /* PIPE_DRIVER_QUERY_TYPE_UINT64 */
+   /* PIPE_DRIVER_QUERY_TYPE_HZ */
    uint64_t u64;
 
    /* PIPE_DRIVER_QUERY_TYPE_UINT */
@@ -787,11 +794,13 @@ union pipe_color_union
 
 enum pipe_driver_query_type
 {
-   PIPE_DRIVER_QUERY_TYPE_UINT64     = 0,
-   PIPE_DRIVER_QUERY_TYPE_UINT       = 1,
-   PIPE_DRIVER_QUERY_TYPE_FLOAT      = 2,
-   PIPE_DRIVER_QUERY_TYPE_PERCENTAGE = 3,
-   PIPE_DRIVER_QUERY_TYPE_BYTES      = 4,
+   PIPE_DRIVER_QUERY_TYPE_UINT64       = 0,
+   PIPE_DRIVER_QUERY_TYPE_UINT         = 1,
+   PIPE_DRIVER_QUERY_TYPE_FLOAT        = 2,
+   PIPE_DRIVER_QUERY_TYPE_PERCENTAGE   = 3,
+   PIPE_DRIVER_QUERY_TYPE_BYTES        = 4,
+   PIPE_DRIVER_QUERY_TYPE_MICROSECONDS = 5,
+   PIPE_DRIVER_QUERY_TYPE_HZ           = 6,
 };
 
 enum pipe_driver_query_group_type
@@ -800,6 +809,15 @@ enum pipe_driver_query_group_type
    PIPE_DRIVER_QUERY_GROUP_TYPE_GPU = 1,
 };
 
+/* Whether an average value per frame or a cumulative value should be
+ * displayed.
+ */
+enum pipe_driver_query_result_type
+{
+   PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE = 0,
+   PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE = 1,
+};
+
 union pipe_numeric_type_union
 {
    uint64_t u64;
@@ -813,6 +831,7 @@ struct pipe_driver_query_info
    unsigned query_type; /* PIPE_QUERY_DRIVER_SPECIFIC + i */
    union pipe_numeric_type_union max_value; /* max value that can be returned */
    enum pipe_driver_query_type type;
+   enum pipe_driver_query_result_type result_type;
    unsigned group_id;
 };
 
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 98b2159defe..0d2658313e5 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -211,12 +211,6 @@ struct pipe_screen {
                             struct pipe_fence_handle **ptr,
                             struct pipe_fence_handle *fence );
 
-   /**
-    * Checks whether the fence has been signalled.
-    */
-   boolean (*fence_signalled)( struct pipe_screen *screen,
-                               struct pipe_fence_handle *fence );
-
    /**
     * Wait for the fence to finish.
     * \param timeout  in nanoseconds (may be PIPE_TIMEOUT_INFINITE).
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index bb57e805c29..6e07b2c5c7c 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -685,7 +685,7 @@ struct tgsi_src_register
  *
  * File, Index and Swizzle are handled the same as in tgsi_src_register.
  *
- * If ArrayID is zero the whole register file might be is indirectly addressed,
+ * If ArrayID is zero the whole register file might be indirectly addressed,
  * if not only the Declaration with this ArrayID is accessed by this operand.
  *
  */
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index a18f12e8a87..1e493f47ccf 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -61,7 +61,8 @@ extern "C" {
 #define PIPE_MAX_SHADER_INPUTS    80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_OUTPUTS   80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_SAMPLER_VIEWS 32
-#define PIPE_MAX_SHADER_RESOURCES 32
+#define PIPE_MAX_SHADER_BUFFERS   32
+#define PIPE_MAX_SHADER_IMAGES    32
 #define PIPE_MAX_TEXTURE_LEVELS   16
 #define PIPE_MAX_SO_BUFFERS        4
 #define PIPE_MAX_SO_OUTPUTS       64
@@ -222,6 +223,9 @@ struct pipe_depth_state
    unsigned enabled:1;         /**< depth test enabled? */
    unsigned writemask:1;       /**< allow depth buffer writes? */
    unsigned func:3;            /**< depth test func (PIPE_FUNC_x) */
+   unsigned bounds_test:1;     /**< depth bounds test enabled? */
+   float bounds_min;           /**< minimum depth bound */
+   float bounds_max;           /**< maximum depth bound */
 };
 
 
@@ -387,6 +391,31 @@ struct pipe_sampler_view
 };
 
 
+/**
+ * A view into a writable buffer or texture that can be bound to a shader
+ * stage.
+ */
+struct pipe_image_view
+{
+   struct pipe_reference reference;
+   struct pipe_resource *resource; /**< resource into which this is a view  */
+   struct pipe_context *context; /**< context this view belongs to */
+   enum pipe_format format;      /**< typed PIPE_FORMAT_x */
+
+   union {
+      struct {
+         unsigned first_layer:16;     /**< first layer to use for array textures */
+         unsigned last_layer:16;      /**< last layer to use for array textures */
+         unsigned level:8;            /**< mipmap level to use */
+      } tex;
+      struct {
+         unsigned first_element;
+         unsigned last_element;
+      } buf;
+   } u;
+};
+
+
 /**
  * Subregion of 1D/2D/3D image resource.
  */
@@ -467,6 +496,16 @@ struct pipe_constant_buffer
 };
 
 
+/**
+ * An untyped shader buffer supporting loads, stores, and atomics.
+ */
+struct pipe_shader_buffer {
+   struct pipe_resource *buffer; /**< the actual buffer */
+   unsigned buffer_offset; /**< offset to start of data in buffer, in bytes */
+   unsigned buffer_size;   /**< how much data can be read in shader */
+};
+
+
 /**
  * A stream output target. The structure specifies the range vertices can
  * be written to.
diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h
index e28d57dd3b0..9a20146f43e 100644
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -34,7 +34,8 @@ enum pipe_video_format
    PIPE_VIDEO_FORMAT_MPEG12,   /**< MPEG1, MPEG2 */
    PIPE_VIDEO_FORMAT_MPEG4,    /**< DIVX, XVID */
    PIPE_VIDEO_FORMAT_VC1,      /**< WMV */
-   PIPE_VIDEO_FORMAT_MPEG4_AVC /**< H.264 */
+   PIPE_VIDEO_FORMAT_MPEG4_AVC,/**< H.264 */
+   PIPE_VIDEO_FORMAT_HEVC      /**< H.265 */
 };
 
 enum pipe_video_profile
@@ -54,7 +55,12 @@ enum pipe_video_profile
    PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
    PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH10,
    PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH422,
-   PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_10,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_12,
+   PIPE_VIDEO_PROFILE_HEVC_MAIN_444
 };
 
 /* Video caps, can be different for each codec/profile */
@@ -68,7 +74,8 @@ enum pipe_video_cap
    PIPE_VIDEO_CAP_PREFERS_INTERLACED = 5,
    PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE = 6,
    PIPE_VIDEO_CAP_SUPPORTS_INTERLACED = 7,
-   PIPE_VIDEO_CAP_MAX_LEVEL = 8
+   PIPE_VIDEO_CAP_MAX_LEVEL = 8,
+   PIPE_VIDEO_CAP_STACKED_FRAMES = 9
 };
 
 enum pipe_video_entrypoint
diff --git a/src/gallium/include/pipe/p_video_state.h b/src/gallium/include/pipe/p_video_state.h
index 3713cd91b09..7d13151e643 100644
--- a/src/gallium/include/pipe/p_video_state.h
+++ b/src/gallium/include/pipe/p_video_state.h
@@ -376,6 +376,111 @@ struct pipe_h264_enc_picture_desc
    bool not_referenced;
 };
 
+struct pipe_h265_sps
+{
+   uint8_t chroma_format_idc;
+   uint8_t separate_colour_plane_flag;
+   uint32_t pic_width_in_luma_samples;
+   uint32_t pic_height_in_luma_samples;
+   uint8_t bit_depth_luma_minus8;
+   uint8_t bit_depth_chroma_minus8;
+   uint8_t log2_max_pic_order_cnt_lsb_minus4;
+   uint8_t sps_max_dec_pic_buffering_minus1;
+   uint8_t log2_min_luma_coding_block_size_minus3;
+   uint8_t log2_diff_max_min_luma_coding_block_size;
+   uint8_t log2_min_transform_block_size_minus2;
+   uint8_t log2_diff_max_min_transform_block_size;
+   uint8_t max_transform_hierarchy_depth_inter;
+   uint8_t max_transform_hierarchy_depth_intra;
+   uint8_t scaling_list_enabled_flag;
+   uint8_t ScalingList4x4[6][16];
+   uint8_t ScalingList8x8[6][64];
+   uint8_t ScalingList16x16[6][64];
+   uint8_t ScalingList32x32[2][64];
+   uint8_t ScalingListDCCoeff16x16[6];
+   uint8_t ScalingListDCCoeff32x32[2];
+   uint8_t amp_enabled_flag;
+   uint8_t sample_adaptive_offset_enabled_flag;
+   uint8_t pcm_enabled_flag;
+   uint8_t pcm_sample_bit_depth_luma_minus1;
+   uint8_t pcm_sample_bit_depth_chroma_minus1;
+   uint8_t log2_min_pcm_luma_coding_block_size_minus3;
+   uint8_t log2_diff_max_min_pcm_luma_coding_block_size;
+   uint8_t pcm_loop_filter_disabled_flag;
+   uint8_t num_short_term_ref_pic_sets;
+   uint8_t long_term_ref_pics_present_flag;
+   uint8_t num_long_term_ref_pics_sps;
+   uint8_t sps_temporal_mvp_enabled_flag;
+   uint8_t strong_intra_smoothing_enabled_flag;
+};
+
+struct pipe_h265_pps
+{
+   struct pipe_h265_sps *sps;
+
+   uint8_t dependent_slice_segments_enabled_flag;
+   uint8_t output_flag_present_flag;
+   uint8_t num_extra_slice_header_bits;
+   uint8_t sign_data_hiding_enabled_flag;
+   uint8_t cabac_init_present_flag;
+   uint8_t num_ref_idx_l0_default_active_minus1;
+   uint8_t num_ref_idx_l1_default_active_minus1;
+   int8_t init_qp_minus26;
+   uint8_t constrained_intra_pred_flag;
+   uint8_t transform_skip_enabled_flag;
+   uint8_t cu_qp_delta_enabled_flag;
+   uint8_t diff_cu_qp_delta_depth;
+   int8_t pps_cb_qp_offset;
+   int8_t pps_cr_qp_offset;
+   uint8_t pps_slice_chroma_qp_offsets_present_flag;
+   uint8_t weighted_pred_flag;
+   uint8_t weighted_bipred_flag;
+   uint8_t transquant_bypass_enabled_flag;
+   uint8_t tiles_enabled_flag;
+   uint8_t entropy_coding_sync_enabled_flag;
+   uint8_t num_tile_columns_minus1;
+   uint8_t num_tile_rows_minus1;
+   uint8_t uniform_spacing_flag;
+   uint16_t column_width_minus1[20];
+   uint16_t row_height_minus1[22];
+   uint8_t loop_filter_across_tiles_enabled_flag;
+   uint8_t pps_loop_filter_across_slices_enabled_flag;
+   uint8_t deblocking_filter_control_present_flag;
+   uint8_t deblocking_filter_override_enabled_flag;
+   uint8_t pps_deblocking_filter_disabled_flag;
+   int8_t pps_beta_offset_div2;
+   int8_t pps_tc_offset_div2;
+   uint8_t lists_modification_present_flag;
+   uint8_t log2_parallel_merge_level_minus2;
+   uint8_t slice_segment_header_extension_present_flag;
+};
+
+struct pipe_h265_picture_desc
+{
+   struct pipe_picture_desc base;
+
+   struct pipe_h265_pps *pps;
+
+   uint8_t IDRPicFlag;
+   uint8_t RAPPicFlag;
+   uint8_t CurrRpsIdx;
+   uint32_t NumPocTotalCurr;
+   uint32_t NumDeltaPocsOfRefRpsIdx;
+   uint32_t NumShortTermPictureSliceHeaderBits;
+   uint32_t NumLongTermPictureSliceHeaderBits;
+
+   int32_t CurrPicOrderCntVal;
+   struct pipe_video_buffer *ref[16];
+   int32_t PicOrderCntVal[16];
+   uint8_t IsLongTerm[16];
+   uint8_t NumPocStCurrBefore;
+   uint8_t NumPocStCurrAfter;
+   uint8_t NumPocLtCurr;
+   uint8_t RefPicSetStCurrBefore[8];
+   uint8_t RefPicSetStCurrAfter[8];
+   uint8_t RefPicSetLtCurr[8];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/state_tracker/st_api.h b/src/gallium/include/state_tracker/st_api.h
index ecf1c07fb98..356863d8531 100644
--- a/src/gallium/include/state_tracker/st_api.h
+++ b/src/gallium/include/state_tracker/st_api.h
@@ -533,7 +533,7 @@ struct st_api
 /**
  * Return true if the visual has the specified buffers.
  */
-static INLINE boolean
+static inline boolean
 st_visual_have_buffers(const struct st_visual *visual, unsigned mask)
 {
    return ((visual->buffer_mask & mask) == mask);
diff --git a/src/gallium/state_trackers/clover/Makefile.am b/src/gallium/state_trackers/clover/Makefile.am
index f46d9ef457d..fd0ccf88cc5 100644
--- a/src/gallium/state_trackers/clover/Makefile.am
+++ b/src/gallium/state_trackers/clover/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include Makefile.sources
 
 AM_CPPFLAGS = \
diff --git a/src/gallium/state_trackers/clover/api/dispatch.cpp b/src/gallium/state_trackers/clover/api/dispatch.cpp
index b5a4094cf2f..f10babe31a0 100644
--- a/src/gallium/state_trackers/clover/api/dispatch.cpp
+++ b/src/gallium/state_trackers/clover/api/dispatch.cpp
@@ -123,12 +123,12 @@ namespace clover {
       clCreateImage,
       clCreateProgramWithBuiltInKernels,
       clCompileProgram,
-      NULL, // clLinkProgram
+      clLinkProgram,
       clUnloadPlatformCompiler,
-      NULL, // clGetKernelArgInfo
-      NULL, // clEnqueueFillBuffer
-      NULL, // clEnqueueFillImage
-      NULL, // clEnqueueMigrateMemObjects
+      clGetKernelArgInfo,
+      clEnqueueFillBuffer,
+      clEnqueueFillImage,
+      clEnqueueMigrateMemObjects,
       clEnqueueMarkerWithWaitList,
       clEnqueueBarrierWithWaitList,
       NULL, // clGetExtensionFunctionAddressForPlatform
diff --git a/src/gallium/state_trackers/clover/api/dispatch.hpp b/src/gallium/state_trackers/clover/api/dispatch.hpp
index ffae1ae6e12..7f622822ef9 100644
--- a/src/gallium/state_trackers/clover/api/dispatch.hpp
+++ b/src/gallium/state_trackers/clover/api/dispatch.hpp
@@ -693,7 +693,13 @@ struct _cl_icd_dispatch {
    CL_API_ENTRY cl_int (CL_API_CALL *clUnloadPlatformCompiler)(
       cl_platform_id platform);
 
-   void *clGetKernelArgInfo;
+   CL_API_ENTRY cl_int (CL_API_CALL *clGetKernelArgInfo)(
+      cl_kernel kernel,
+      cl_uint arg_indx,
+      cl_kernel_arg_info  param_name,
+      size_t param_value_size,
+      void *param_value,
+      size_t *param_value_size_ret);
 
    CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueFillBuffer)(
       cl_command_queue command_queue,
@@ -701,7 +707,7 @@ struct _cl_icd_dispatch {
       const void *pattern,
       size_t pattern_size,
       size_t offset,
-      size_t cb,
+      size_t size,
       cl_uint num_events_in_wait_list,
       const cl_event *event_wait_list,
       cl_event *event);
@@ -710,13 +716,20 @@ struct _cl_icd_dispatch {
       cl_command_queue command_queue,
       cl_mem image,
       const void *fill_color,
-      const size_t origin[3],
-      const size_t region[3],
+      const size_t *origin,
+      const size_t *region,
       cl_uint num_events_in_wait_list,
       const cl_event *event_wait_list,
       cl_event *event);
 
-   void *clEnqueueMigrateMemObjects;
+   CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMigrateMemObjects)(
+      cl_command_queue command_queue,
+      cl_uint num_mem_objects,
+      const cl_mem *mem_objects,
+      cl_mem_migration_flags flags,
+      cl_uint num_events_in_wait_list,
+      const cl_event *event_wait_list,
+      cl_event *event);
 
    CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMarkerWithWaitList)(
       cl_command_queue command_queue,
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index 05cc392a914..73ba34abe8e 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -169,7 +169,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
       break;
 
    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-      buf.as_scalar<size_t>() = 1;
+      buf.as_scalar<size_t>() = dev.subgroup_size();
       break;
 
    case CL_KERNEL_PRIVATE_MEM_SIZE:
@@ -189,6 +189,14 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
    return CL_INVALID_DEVICE;
 }
 
+CLOVER_API cl_int
+clGetKernelArgInfo(cl_kernel d_kern,
+                   cl_uint idx, cl_kernel_arg_info param,
+                   size_t size, void *r_buf, size_t *r_size) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
+}
+
 namespace {
    ///
    /// Common argument checking shared by kernel invocation commands.
diff --git a/src/gallium/state_trackers/clover/api/memory.cpp b/src/gallium/state_trackers/clover/api/memory.cpp
index 3ff6ba0e1c5..1efb95b5ce7 100644
--- a/src/gallium/state_trackers/clover/api/memory.cpp
+++ b/src/gallium/state_trackers/clover/api/memory.cpp
@@ -357,9 +357,29 @@ clCreateImage(cl_context d_ctx, cl_mem_flags flags,
               const cl_image_format *format,
               const cl_image_desc *image_desc,
               void *host_ptr, cl_int *r_errcode) {
-   // This function was added in OpenCL 1.2
-   std::cerr << "CL user error: clCreateImage() not supported by OpenCL 1.1." <<
-                std::endl;
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
    ret_error(r_errcode, CL_INVALID_OPERATION);
    return NULL;
 }
+
+CLOVER_API cl_int
+clEnqueueFillBuffer(cl_command_queue command_queue, cl_mem buffer,
+                    const void *pattern, size_t pattern_size,
+                    size_t offset, size_t size,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event *event_wait_list,
+                    cl_event *event) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_INVALID_VALUE;
+}
+
+CLOVER_API cl_int
+clEnqueueFillImage(cl_command_queue command_queue, cl_mem image,
+                   const void *fill_color,
+                   const size_t *origin, const size_t *region,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_INVALID_VALUE;
+}
diff --git a/src/gallium/state_trackers/clover/api/program.cpp b/src/gallium/state_trackers/clover/api/program.cpp
index e9b1f384344..27ca2efd0bc 100644
--- a/src/gallium/state_trackers/clover/api/program.cpp
+++ b/src/gallium/state_trackers/clover/api/program.cpp
@@ -231,6 +231,16 @@ clCompileProgram(cl_program d_prog, cl_uint num_devs,
    return e.get();
 }
 
+CLOVER_API cl_program
+clLinkProgram(cl_context d_ctx, cl_uint num_devs, const cl_device_id *d_devs,
+              const char *p_opts, cl_uint num_progs, const cl_program *d_progs,
+              void (*pfn_notify)(cl_program, void *), void *user_data,
+              cl_int *r_errcode) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   ret_error(r_errcode, CL_LINKER_NOT_AVAILABLE);
+   return NULL;
+}
+
 CLOVER_API cl_int
 clUnloadCompiler() {
    return CL_SUCCESS;
diff --git a/src/gallium/state_trackers/clover/api/transfer.cpp b/src/gallium/state_trackers/clover/api/transfer.cpp
index fdb9405c918..f7046253be8 100644
--- a/src/gallium/state_trackers/clover/api/transfer.cpp
+++ b/src/gallium/state_trackers/clover/api/transfer.cpp
@@ -726,3 +726,15 @@ clEnqueueUnmapMemObject(cl_command_queue d_q, cl_mem d_mem, void *ptr,
 } catch (error &e) {
    return e.get();
 }
+
+CLOVER_API cl_int
+clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+                           cl_uint num_mem_objects,
+                           const cl_mem *mem_objects,
+                           cl_mem_migration_flags flags,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list,
+                           cl_event *event) {
+   CLOVER_NOT_SUPPORTED_UNTIL("1.2");
+   return CL_INVALID_VALUE;
+}
diff --git a/src/gallium/state_trackers/clover/api/util.hpp b/src/gallium/state_trackers/clover/api/util.hpp
index 918df6125a4..31e20e424b9 100644
--- a/src/gallium/state_trackers/clover/api/util.hpp
+++ b/src/gallium/state_trackers/clover/api/util.hpp
@@ -38,6 +38,13 @@
 #define CLOVER_ICD_API PUBLIC
 #endif
 
+#define CLOVER_NOT_SUPPORTED_UNTIL(version)                    \
+   do {                                                        \
+      std::cerr << "CL user error: " << __func__               \
+                << "() requires OpenCL version " << (version)  \
+                << " or greater." << std::endl;                \
+   } while (0)
+
 namespace clover {
    ///
    /// Return an error code in \a p if non-zero.
diff --git a/src/gallium/state_trackers/clover/core/compiler.hpp b/src/gallium/state_trackers/clover/core/compiler.hpp
index c68aa39db85..207641785ca 100644
--- a/src/gallium/state_trackers/clover/core/compiler.hpp
+++ b/src/gallium/state_trackers/clover/core/compiler.hpp
@@ -37,7 +37,8 @@ namespace clover {
                                const std::string &opts,
                                std::string &r_log);
 
-   module compile_program_tgsi(const std::string &source);
+   module compile_program_tgsi(const std::string &source,
+                               std::string &r_log);
 }
 
 #endif
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 42b45b7f2b8..6efff79c7f4 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -89,12 +89,12 @@ device::vendor_id() const {
 
 size_t
 device::max_images_read() const {
-   return PIPE_MAX_SHADER_RESOURCES;
+   return PIPE_MAX_SHADER_IMAGES;
 }
 
 size_t
 device::max_images_write() const {
-   return PIPE_MAX_SHADER_RESOURCES;
+   return PIPE_MAX_SHADER_IMAGES;
 }
 
 cl_uint
@@ -185,6 +185,11 @@ device::max_block_size() const {
    return { v.begin(), v.end() };
 }
 
+cl_uint
+device::subgroup_size() const {
+   return get_compute_param<uint32_t>(pipe, PIPE_COMPUTE_CAP_SUBGROUP_SIZE)[0];
+}
+
 std::string
 device::device_name() const {
    return pipe->get_name(pipe);
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
index de5fc6bb9c4..285784744f3 100644
--- a/src/gallium/state_trackers/clover/core/device.hpp
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -67,6 +67,7 @@ namespace clover {
       bool has_doubles() const;
 
       std::vector<size_t> max_block_size() const;
+      cl_uint subgroup_size() const;
       std::string device_name() const;
       std::string vendor_name() const;
       enum pipe_shader_ir ir_format() const;
diff --git a/src/gallium/state_trackers/clover/core/error.hpp b/src/gallium/state_trackers/clover/core/error.hpp
index 780b973383a..59a5af4c799 100644
--- a/src/gallium/state_trackers/clover/core/error.hpp
+++ b/src/gallium/state_trackers/clover/core/error.hpp
@@ -65,9 +65,9 @@ namespace clover {
       cl_int code;
    };
 
-   class build_error : public error {
+   class compile_error : public error {
    public:
-      build_error(const std::string &what = "") :
+      compile_error(const std::string &what = "") :
          error(CL_COMPILE_PROGRAM_FAILURE, what) {
       }
    };
diff --git a/src/gallium/state_trackers/clover/core/event.cpp b/src/gallium/state_trackers/clover/core/event.cpp
index e1f9de07f83..d75b8397794 100644
--- a/src/gallium/state_trackers/clover/core/event.cpp
+++ b/src/gallium/state_trackers/clover/core/event.cpp
@@ -141,7 +141,7 @@ hard_event::status() const {
    else if (!_fence)
       return CL_QUEUED;
 
-   else if (!screen->fence_signalled(screen, _fence))
+   else if (!screen->fence_finish(screen, _fence, 0))
       return CL_SUBMITTED;
 
    else
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 0756f068553..a226ec1a752 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -182,6 +182,34 @@ kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
          }
          break;
       }
+      case module::argument::image_size: {
+         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
+         std::vector<cl_uint> image_size{
+               static_cast<cl_uint>(img->width()),
+               static_cast<cl_uint>(img->height()),
+               static_cast<cl_uint>(img->depth())};
+         for (auto x : image_size) {
+            auto arg = argument::create(marg);
+
+            arg->set(sizeof(x), &x);
+            arg->bind(*this, marg);
+         }
+         break;
+      }
+      case module::argument::image_format: {
+         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
+         cl_image_format fmt = img->format();
+         std::vector<cl_uint> image_format{
+               static_cast<cl_uint>(fmt.image_channel_data_type),
+               static_cast<cl_uint>(fmt.image_channel_order)};
+         for (auto x : image_format) {
+            auto arg = argument::create(marg);
+
+            arg->set(sizeof(x), &x);
+            arg->bind(*this, marg);
+         }
+         break;
+      }
       }
    }
 
@@ -339,6 +367,9 @@ kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
 
 void
 kernel::scalar_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
    if (size != this->size)
       throw error(CL_INVALID_ARG_SIZE);
 
@@ -407,6 +438,9 @@ kernel::local_argument::set(size_t size, const void *value) {
    if (value)
       throw error(CL_INVALID_ARG_VALUE);
 
+   if (!size)
+      throw error(CL_INVALID_ARG_SIZE);
+
    _storage = size;
    _set = true;
 }
@@ -466,6 +500,9 @@ kernel::constant_argument::unbind(exec_context &ctx) {
 
 void
 kernel::image_rd_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
    if (size != sizeof(cl_mem))
       throw error(CL_INVALID_ARG_SIZE);
 
@@ -494,6 +531,9 @@ kernel::image_rd_argument::unbind(exec_context &ctx) {
 
 void
 kernel::image_wr_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
    if (size != sizeof(cl_mem))
       throw error(CL_INVALID_ARG_SIZE);
 
@@ -522,6 +562,9 @@ kernel::image_wr_argument::unbind(exec_context &ctx) {
 
 void
 kernel::sampler_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_SAMPLER);
+
    if (size != sizeof(cl_sampler))
       throw error(CL_INVALID_ARG_SIZE);
 
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
index d6432a4df8d..4ba6ff467b7 100644
--- a/src/gallium/state_trackers/clover/core/kernel.hpp
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -190,7 +190,16 @@ namespace clover {
          pipe_surface *st;
       };
 
-      class image_rd_argument : public argument {
+      // Shared base of image_rd_argument and image_wr_argument: owns the
+      // image pointer and offers read-only access to it.
+      class image_argument : public argument {
+      public:
+         const image *get() const { return img; }
+      protected:
+         image *img;
+      };
+
+      class image_rd_argument : public image_argument {
       public:
          virtual void set(size_t size, const void *value);
          virtual void bind(exec_context &ctx,
@@ -198,11 +207,10 @@ namespace clover {
          virtual void unbind(exec_context &ctx);
 
       private:
-         image *img;
          pipe_sampler_view *st;
       };
 
-      class image_wr_argument : public argument {
+      class image_wr_argument : public image_argument {
       public:
          virtual void set(size_t size, const void *value);
          virtual void bind(exec_context &ctx,
@@ -210,7 +218,6 @@ namespace clover {
          virtual void unbind(exec_context &ctx);
 
       private:
-         image *img;
          pipe_surface *st;
       };
 
diff --git a/src/gallium/state_trackers/clover/core/memory.cpp b/src/gallium/state_trackers/clover/core/memory.cpp
index 055336a3325..b852e6896fe 100644
--- a/src/gallium/state_trackers/clover/core/memory.cpp
+++ b/src/gallium/state_trackers/clover/core/memory.cpp
@@ -189,7 +189,7 @@ image2d::image2d(clover::context &ctx, cl_mem_flags flags,
                  const cl_image_format *format, size_t width,
                  size_t height, size_t row_pitch,
                  void *host_ptr) :
-   image(ctx, flags, format, width, height, 0,
+   image(ctx, flags, format, width, height, 1,
          row_pitch, 0, height * row_pitch, host_ptr) {
 }
 
diff --git a/src/gallium/state_trackers/clover/core/module.hpp b/src/gallium/state_trackers/clover/core/module.hpp
index 9d656885945..5db0548872c 100644
--- a/src/gallium/state_trackers/clover/core/module.hpp
+++ b/src/gallium/state_trackers/clover/core/module.hpp
@@ -72,7 +72,9 @@ namespace clover {
          enum semantic {
             general,
             grid_dimension,
-            grid_offset
+            grid_offset,
+            image_size,
+            image_format
          };
 
          argument(enum type type, size_t size,
diff --git a/src/gallium/state_trackers/clover/core/program.cpp b/src/gallium/state_trackers/clover/core/program.cpp
index 0d6cc402db7..6eebd9c5cda 100644
--- a/src/gallium/state_trackers/clover/core/program.cpp
+++ b/src/gallium/state_trackers/clover/core/program.cpp
@@ -56,14 +56,14 @@ program::build(const ref_vector<device> &devs, const char *opts,
 
          try {
             auto module = (dev.ir_format() == PIPE_SHADER_IR_TGSI ?
-                           compile_program_tgsi(_source) :
+                           compile_program_tgsi(_source, log) :
                            compile_program_llvm(_source, headers,
                                                 dev.ir_format(),
                                                 dev.ir_target(), build_opts(dev),
                                                 log));
             _binaries.insert({ &dev, module });
             _logs.insert({ &dev, log });
-         } catch (const build_error &) {
+         } catch (const error &) {
             _logs.insert({ &dev, log });
             throw;
          }
diff --git a/src/gallium/state_trackers/clover/core/resource.cpp b/src/gallium/state_trackers/clover/core/resource.cpp
index 78ebafb644f..10a29a94eac 100644
--- a/src/gallium/state_trackers/clover/core/resource.cpp
+++ b/src/gallium/state_trackers/clover/core/resource.cpp
@@ -132,6 +132,7 @@ root_resource::root_resource(clover::device &dev, memory_obj &obj,
       info.depth0 = 1;
    }
 
+   info.array_size = 1;
    info.target = translate_target(obj.type());
    info.bind = (PIPE_BIND_SAMPLER_VIEW |
                 PIPE_BIND_COMPUTE_RESOURCE |
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 9b91fee9032..63c3f8ee49b 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -108,7 +108,7 @@ namespace {
          name, llvm::MemoryBuffer::getMemBuffer(source));
 
       if (!c.ExecuteAction(act))
-         throw build_error(log);
+         throw compile_error(log);
    }
 
    module
@@ -256,7 +256,7 @@ namespace {
       r_log = log;
 
       if (!ExecSuccess)
-         throw build_error();
+         throw compile_error();
 
       // Get address spaces map to be able to find kernel argument address space
       memcpy(address_spaces, c.getTarget().getAddressSpaceMap(),
@@ -269,17 +269,19 @@ namespace {
 #endif
    }
 
-   void
-   find_kernels(llvm::Module *mod, std::vector<llvm::Function *> &kernels) {
+   std::vector<llvm::Function *>
+   find_kernels(const llvm::Module *mod) {
       const llvm::NamedMDNode *kernel_node =
                                  mod->getNamedMetadata("opencl.kernels");
       // This means there are no kernels in the program.  The spec does not
       // require that we return an error here, but there will be an error if
       // the user tries to pass this program to a clCreateKernel() call.
       if (!kernel_node) {
-         return;
+         return std::vector<llvm::Function *>();
       }
 
+      std::vector<llvm::Function *> kernels;
+      kernels.reserve(kernel_node->getNumOperands());
       for (unsigned i = 0; i < kernel_node->getNumOperands(); ++i) {
 #if HAVE_LLVM >= 0x0306
          kernels.push_back(llvm::mdconst::dyn_extract<llvm::Function>(
@@ -288,11 +290,11 @@ namespace {
 #endif
                                     kernel_node->getOperand(i)->getOperand(0)));
       }
+      return kernels;
    }
 
    void
-   optimize(llvm::Module *mod, unsigned optimization_level,
-            const std::vector<llvm::Function *> &kernels) {
+   optimize(llvm::Module *mod, unsigned optimization_level) {
 
 #if HAVE_LLVM >= 0x0307
       llvm::legacy::PassManager PM;
@@ -300,6 +302,8 @@ namespace {
       llvm::PassManager PM;
 #endif
 
+      const std::vector<llvm::Function *> kernels = find_kernels(mod);
+
       // Add a function internalizer pass.
       //
       // By default, the function internalizer pass will look for a function
@@ -340,18 +344,91 @@ namespace {
       PM.run(*mod);
    }
 
+   // Kernel metadata
+
+   // Return the "opencl.kernels" metadata entry whose first operand is
+   // kernel_func, or nullptr when the module carries no such entry.
+   const llvm::MDNode *
+   get_kernel_metadata(const llvm::Function *kernel_func) {
+      const auto kernels_md =
+         kernel_func->getParent()->getNamedMetadata("opencl.kernels");
+      if (!kernels_md)
+         return nullptr;
+
+      for (unsigned idx = 0; idx < kernels_md->getNumOperands(); ++idx) {
+         const llvm::MDNode *entry = kernels_md->getOperand(idx);
+#if HAVE_LLVM >= 0x0306
+         auto candidate = llvm::mdconst::dyn_extract<llvm::Function>(
+#else
+         auto candidate = llvm::dyn_cast<llvm::Function>(
+#endif
+                                    entry->getOperand(0));
+         if (candidate == kernel_func)
+            return entry;
+      }
+
+      // No entry references this function.
+      return nullptr;
+   }
+
+   // Cast md_operand to an MDNode; assert-enabled builds also verify that it
+   // has expect_num_args operands and that operand 0 is the expect_name tag.
+   llvm::MDNode*
+   node_from_op_checked(const llvm::MDOperand &md_operand,
+                        llvm::StringRef expect_name,
+                        unsigned expect_num_args)
+   {
+      llvm::MDNode *md = llvm::cast<llvm::MDNode>(md_operand);
+      assert(md->getNumOperands() == expect_num_args &&
+             "Wrong number of operands.");
+      auto tag = llvm::cast<llvm::MDString>(md->getOperand(0));
+      assert(tag->getString() == expect_name &&
+             "Wrong metadata node name.");
+      return md;
+   }
+
+   // Type name and access qualifier strings describing one kernel argument.
+   struct kernel_arg_md {
+      llvm::StringRef type_name, access_qual;
+      kernel_arg_md(llvm::StringRef type_name_, llvm::StringRef access_qual_)
+         : type_name(type_name_), access_qual(access_qual_) {}
+   };
+
+   // Per-argument type name and access qualifier from the kernel's metadata.
+   // Asserts that kernel_func has an "opencl.kernels" entry before using it.
+   std::vector<kernel_arg_md>
+   get_kernel_arg_md(const llvm::Function *kernel_func) {
+      auto num_args = kernel_func->getArgumentList().size();
+      auto kernel_node = get_kernel_metadata(kernel_func);
+      assert(kernel_node && "Kernel metadata not found in module.");
+      auto aq = node_from_op_checked(kernel_node->getOperand(2),
+                                     "kernel_arg_access_qual", num_args + 1);
+      auto ty = node_from_op_checked(kernel_node->getOperand(3),
+                                     "kernel_arg_type", num_args + 1);
+
+      std::vector<kernel_arg_md> res;
+      res.reserve(num_args);
+      for (unsigned i = 0; i < num_args; ++i) {
+         res.push_back(kernel_arg_md(
+            llvm::cast<llvm::MDString>(ty->getOperand(i+1))->getString(),
+            llvm::cast<llvm::MDString>(aq->getOperand(i+1))->getString()));
+      }
+      return res;
+   }
+
    std::vector<module::argument>
    get_kernel_args(const llvm::Module *mod, const std::string &kernel_name,
                    const clang::LangAS::Map &address_spaces) {
 
       std::vector<module::argument> args;
       llvm::Function *kernel_func = mod->getFunction(kernel_name);
+      assert(kernel_func && "Kernel name not found in module.");
+      auto arg_md = get_kernel_arg_md(kernel_func);
 
       llvm::DataLayout TD(mod);
+      llvm::Type *size_type =
+         TD.getSmallestLegalIntType(mod->getContext(), sizeof(cl_uint) * 8);
 
-      for (llvm::Function::const_arg_iterator I = kernel_func->arg_begin(),
-                                      E = kernel_func->arg_end(); I != E; ++I) {
-         const llvm::Argument &arg = *I;
+      for (const auto &arg: kernel_func->args()) {
 
          llvm::Type *arg_type = arg.getType();
          const unsigned arg_store_size = TD.getTypeStoreSize(arg_type);
@@ -369,6 +446,59 @@ namespace {
          unsigned target_size = TD.getTypeStoreSize(target_type);
          unsigned target_align = TD.getABITypeAlignment(target_type);
 
+         llvm::StringRef type_name = arg_md[arg.getArgNo()].type_name;
+         llvm::StringRef access_qual = arg_md[arg.getArgNo()].access_qual;
+
+         // Image
+         const bool is_image2d = type_name == "image2d_t";
+         const bool is_image3d = type_name == "image3d_t";
+         if (is_image2d || is_image3d) {
+            const bool is_write_only = access_qual == "write_only";
+            const bool is_read_only = access_qual == "read_only";
+
+            typename module::argument::type marg_type;
+            if (is_image2d && is_read_only) {
+               marg_type = module::argument::image2d_rd;
+            } else if (is_image2d && is_write_only) {
+               marg_type = module::argument::image2d_wr;
+            } else if (is_image3d && is_read_only) {
+               marg_type = module::argument::image3d_rd;
+            } else if (is_image3d && is_write_only) {
+               marg_type = module::argument::image3d_wr;
+            } else {
+               assert(0 && "Wrong image access qualifier");
+            }
+
+            args.push_back(module::argument(marg_type,
+                                            arg_store_size, target_size,
+                                            target_align,
+                                            module::argument::zero_ext));
+            continue;
+         }
+
+         // Image size implicit argument
+         if (type_name == "__llvm_image_size") {
+            args.push_back(module::argument(module::argument::scalar,
+                                            sizeof(cl_uint),
+                                            TD.getTypeStoreSize(size_type),
+                                            TD.getABITypeAlignment(size_type),
+                                            module::argument::zero_ext,
+                                            module::argument::image_size));
+            continue;
+         }
+
+         // Image format implicit argument
+         if (type_name == "__llvm_image_format") {
+            args.push_back(module::argument(module::argument::scalar,
+                                            sizeof(cl_uint),
+                                            TD.getTypeStoreSize(size_type),
+                                            TD.getABITypeAlignment(size_type),
+                                            module::argument::zero_ext,
+                                            module::argument::image_format));
+            continue;
+         }
+
+         // Other types
          if (llvm::isa<llvm::PointerType>(arg_type) && arg.hasByValAttr()) {
             arg_type =
                   llvm::dyn_cast<llvm::PointerType>(arg_type)->getElementType();
@@ -413,9 +543,6 @@ namespace {
       // Append implicit arguments.  XXX - The types, ordering and
       // vector size of the implicit arguments should depend on the
       // target according to the selected calling convention.
-      llvm::Type *size_type =
-         TD.getSmallestLegalIntType(mod->getContext(), sizeof(cl_uint) * 8);
-
       args.push_back(
          module::argument(module::argument::scalar, sizeof(cl_uint),
                           TD.getTypeStoreSize(size_type),
@@ -435,7 +562,6 @@ namespace {
 
    module
    build_module_llvm(llvm::Module *mod,
-                     const std::vector<llvm::Function *> &kernels,
                      clang::LangAS::Map& address_spaces) {
 
       module m;
@@ -445,8 +571,11 @@ namespace {
       llvm::raw_svector_ostream bitcode_ostream(llvm_bitcode);
       llvm::BitstreamWriter writer(llvm_bitcode);
       llvm::WriteBitcodeToFile(mod, bitcode_ostream);
+#if HAVE_LLVM < 0x0308
       bitcode_ostream.flush();
+#endif
 
+      const std::vector<llvm::Function *> kernels = find_kernels(mod);
       for (unsigned i = 0; i < kernels.size(); ++i) {
          std::string kernel_name = kernels[i]->getName();
          std::vector<module::argument> args =
@@ -485,7 +614,7 @@ namespace {
       LLVMDisposeMessage(err_message);
 
       if (err) {
-         throw build_error();
+         throw compile_error();
       }
    }
 
@@ -505,7 +634,7 @@ namespace {
       if (LLVMGetTargetFromTriple(triple.c_str(), &target, &error_message)) {
          r_log = std::string(error_message);
          LLVMDisposeMessage(error_message);
-         throw build_error();
+         throw compile_error();
       }
 
       LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
@@ -514,7 +643,7 @@ namespace {
 
       if (!tm) {
          r_log = "Could not create TargetMachine: " + triple;
-         throw build_error();
+         throw compile_error();
       }
 
       if (dump_asm) {
@@ -567,7 +696,7 @@ namespace {
             const char *name;
             if (gelf_getshdr(section, &symtab_header) != &symtab_header) {
                r_log = "Failed to read ELF section header.";
-               throw build_error();
+               throw compile_error();
             }
             name = elf_strptr(elf, section_str_index, symtab_header.sh_name);
            if (!strcmp(name, ".symtab")) {
@@ -577,9 +706,9 @@ namespace {
          }
          if (!symtab) {
             r_log = "Unable to find symbol table.";
-            throw build_error();
+            throw compile_error();
          }
-      } catch (build_error &e) {
+      } catch (compile_error &e) {
          elf_end(elf);
          throw e;
       }
@@ -610,10 +739,11 @@ namespace {
    module
    build_module_native(std::vector<char> &code,
                        const llvm::Module *mod,
-                       const std::vector<llvm::Function *> &kernels,
                        const clang::LangAS::Map &address_spaces,
                        std::string &r_log) {
 
+      const std::vector<llvm::Function *> kernels = find_kernels(mod);
+
       std::map<std::string, unsigned> kernel_offsets =
             get_kernel_offsets(code, kernels, r_log);
 
@@ -650,7 +780,7 @@ namespace {
          stream.flush();
          *(std::string*)data = message;
 
-         throw build_error();
+         throw compile_error();
       }
    }
 
@@ -697,7 +827,6 @@ clover::compile_program_llvm(const std::string &source,
 
    init_targets();
 
-   std::vector<llvm::Function *> kernels;
    size_t processor_str_len = std::string(target).find_first_of("-");
    std::string processor(target, 0, processor_str_len);
    std::string triple(target, processor_str_len + 1,
@@ -717,9 +846,7 @@ clover::compile_program_llvm(const std::string &source,
                                     triple, processor, opts, address_spaces,
                                     optimization_level, r_log);
 
-   find_kernels(mod, kernels);
-
-   optimize(mod, optimization_level, kernels);
+   optimize(mod, optimization_level);
 
    if (get_debug_flags() & DBG_LLVM) {
       std::string log;
@@ -738,13 +865,13 @@ clover::compile_program_llvm(const std::string &source,
          m = module();
          break;
       case PIPE_SHADER_IR_LLVM:
-         m = build_module_llvm(mod, kernels, address_spaces);
+         m = build_module_llvm(mod, address_spaces);
          break;
       case PIPE_SHADER_IR_NATIVE: {
          std::vector<char> code = compile_native(mod, triple, processor,
                                                  get_debug_flags() & DBG_ASM,
                                                  r_log);
-         m = build_module_native(code, mod, kernels, address_spaces, r_log);
+         m = build_module_native(code, mod, address_spaces, r_log);
          break;
       }
    }
diff --git a/src/gallium/state_trackers/clover/tgsi/compiler.cpp b/src/gallium/state_trackers/clover/tgsi/compiler.cpp
index b70104e7604..54cb747e6fb 100644
--- a/src/gallium/state_trackers/clover/tgsi/compiler.cpp
+++ b/src/gallium/state_trackers/clover/tgsi/compiler.cpp
@@ -32,7 +32,7 @@ using namespace clover;
 
 namespace {
    void
-   read_header(const std::string &header, module &m) {
+   read_header(const std::string &header, module &m, std::string &r_log) {
       std::istringstream ls(header);
       std::string line;
 
@@ -45,8 +45,10 @@ namespace {
          if (!(ts >> name))
             continue;
 
-         if (!(ts >> offset))
-            throw build_error("invalid kernel start address");
+         if (!(ts >> offset)) {
+            r_log = "invalid kernel start address";
+            throw compile_error();
+         }
 
          while (ts >> tok) {
             if (tok == "scalar")
@@ -67,8 +69,10 @@ namespace {
                args.push_back({ module::argument::image3d_wr, 4 });
             else if (tok == "sampler")
                args.push_back({ module::argument::sampler, 0 });
-            else
-               throw build_error("invalid kernel argument");
+            else {
+               r_log = "invalid kernel argument";
+               throw compile_error();
+            }
          }
 
          m.syms.push_back({ name, 0, offset, args });
@@ -76,11 +80,13 @@ namespace {
    }
 
    void
-   read_body(const char *source, module &m) {
+   read_body(const char *source, module &m, std::string &r_log) {
       tgsi_token prog[1024];
 
-      if (!tgsi_text_translate(source, prog, Elements(prog)))
-         throw build_error("translate failed");
+      if (!tgsi_text_translate(source, prog, Elements(prog))) {
+         r_log = "translate failed";
+         throw compile_error();
+      }
 
       unsigned sz = tgsi_num_tokens(prog) * sizeof(tgsi_token);
       std::vector<char> data( (char *)prog, (char *)prog + sz );
@@ -89,13 +95,23 @@ namespace {
 }
 
+// Build a module from TGSI text: everything before the "COMP\n" marker is
+// handed to read_header(), the marker and what follows to read_body().
+// On failure r_log receives a message and compile_error is thrown.
 module
-clover::compile_program_tgsi(const std::string &source) {
+clover::compile_program_tgsi(const std::string &source, std::string &r_log) {
    const size_t body_pos = source.find("COMP\n");
+   // find() yields npos when the marker is absent; using it as an index or
+   // iterator offset would run past the end of source, so reject the input.
+   if (body_pos == std::string::npos) {
+      r_log = "invalid source";
+      throw compile_error();
+   }
+
    const char *body = &source[body_pos];
    module m;
 
-   read_header({ source.begin(), source.begin() + body_pos }, m);
-   read_body(body, m);
+   read_header({ source.begin(), source.begin() + body_pos }, m, r_log);
+   read_body(body, m, r_log);
 
    return m;
 }
diff --git a/src/gallium/state_trackers/dri/Android.mk b/src/gallium/state_trackers/dri/Android.mk
index 188e4a1404d..43f0de9b464 100644
--- a/src/gallium/state_trackers/dri/Android.mk
+++ b/src/gallium/state_trackers/dri/Android.mk
@@ -44,14 +44,10 @@ LOCAL_STATIC_LIBRARIES := \
 	libmesa_dri_common \
 
 ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
-LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
 LOCAL_SRC_FILES += $(drisw_SOURCES)
 endif
 
-# swrast only?
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
-else
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
 LOCAL_SRC_FILES += $(dri2_SOURCES)
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
diff --git a/src/gallium/state_trackers/dri/Makefile.am b/src/gallium/state_trackers/dri/Makefile.am
index d2c7a82d720..9f4deba0c1e 100644
--- a/src/gallium/state_trackers/dri/Makefile.am
+++ b/src/gallium/state_trackers/dri/Makefile.am
@@ -50,10 +50,6 @@ noinst_LTLIBRARIES = libdri.la
 libdri_la_SOURCES = $(common_SOURCES)
 
 if HAVE_DRISW
-if !HAVE_DRI2
-AM_CPPFLAGS += \
-	-D__NOT_HAVE_DRM_H
-endif
 libdri_la_SOURCES += $(drisw_SOURCES)
 endif
 
diff --git a/src/gallium/state_trackers/dri/SConscript b/src/gallium/state_trackers/dri/SConscript
index 89b5e611c2e..657300baf13 100644
--- a/src/gallium/state_trackers/dri/SConscript
+++ b/src/gallium/state_trackers/dri/SConscript
@@ -5,10 +5,7 @@ Import('*')
 
 env = env.Clone()
 
-# XXX: If HAVE_DRI2
 env.PkgUseModules(['DRM'])
-# else
-#env.Append(CPPDEFINES = [('__NOT_HAVE_DRM_H', '1')])
 
 env.Append(CPPPATH = [
     '#/src',
@@ -20,7 +17,6 @@ env.Append(CPPPATH = [
 
 env.Append(CPPDEFINES = [
     ('GALLIUM_STATIC_TARGETS', '1'),
-    'GALLIUM_SOFTPIPE',
 ])
 
 sources = env.ParseSourceList('Makefile.sources', 'common_SOURCES')
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 8d93f786433..91b443147d6 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -554,7 +554,7 @@ dri2_allocate_textures(struct dri_context *ctx,
 
          if (drawable->textures[statt]) {
             templ.format = drawable->textures[statt]->format;
-            templ.bind = drawable->textures[statt]->bind;
+            templ.bind = drawable->textures[statt]->bind & ~PIPE_BIND_SCANOUT;
             templ.nr_samples = drawable->stvis.samples;
 
             /* Try to reuse the resource.
@@ -1460,7 +1460,7 @@ dri2_init_screen(__DRIscreen * sPriv)
    throttle_ret = dd_configuration(DRM_CONF_THROTTLE);
    dmabuf_ret = dd_configuration(DRM_CONF_SHARE_FD);
 #else
-   if (pipe_loader_drm_probe_fd(&screen->dev, screen->fd, false)) {
+   if (pipe_loader_drm_probe_fd(&screen->dev, screen->fd)) {
       pscreen = pipe_loader_create_screen(screen->dev, PIPE_SEARCH_DIR);
 
       throttle_ret = pipe_loader_configuration(screen->dev, DRM_CONF_THROTTLE);
diff --git a/src/gallium/state_trackers/dri/dri2_buffer.h b/src/gallium/state_trackers/dri/dri2_buffer.h
index e8e474ddb76..0cee4e906e6 100644
--- a/src/gallium/state_trackers/dri/dri2_buffer.h
+++ b/src/gallium/state_trackers/dri/dri2_buffer.h
@@ -11,7 +11,7 @@ struct dri2_buffer
    struct pipe_resource *resource;
 };
 
-static INLINE struct dri2_buffer *
+static inline struct dri2_buffer *
 dri2_buffer(__DRIbuffer * driBufferPriv)
 {
    return (struct dri2_buffer *) driBufferPriv;
diff --git a/src/gallium/state_trackers/dri/dri_context.h b/src/gallium/state_trackers/dri/dri_context.h
index 56dfa2ccc70..96f06442fa0 100644
--- a/src/gallium/state_trackers/dri/dri_context.h
+++ b/src/gallium/state_trackers/dri/dri_context.h
@@ -59,7 +59,7 @@ struct dri_context
    struct hud_context *hud;
 };
 
-static INLINE struct dri_context *
+static inline struct dri_context *
 dri_context(__DRIcontext * driContextPriv)
 {
    if (!driContextPriv)
diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c
index b8afe6c4d23..0d2929aaaa1 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -279,7 +279,12 @@ dri_drawable_get_format(struct dri_drawable *drawable,
    case ST_ATTACHMENT_BACK_LEFT:
    case ST_ATTACHMENT_FRONT_RIGHT:
    case ST_ATTACHMENT_BACK_RIGHT:
-      *format = drawable->stvis.color_format;
+      /* Other pieces of the driver stack get confused and behave incorrectly
+       * when they get an sRGB drawable. st/mesa receives "drawable->stvis"
+       * through other means and handles it correctly, so we don't really need
+       * to use an sRGB format here.
+       */
+      *format = util_format_linear(drawable->stvis.color_format);
       *bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
       break;
    case ST_ATTACHMENT_DEPTH_STENCIL:
diff --git a/src/gallium/state_trackers/dri/dri_drawable.h b/src/gallium/state_trackers/dri/dri_drawable.h
index c5142181e89..1f9842ea541 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.h
+++ b/src/gallium/state_trackers/dri/dri_drawable.h
@@ -87,7 +87,7 @@ struct dri_drawable
                              struct pipe_resource *res);
 };
 
-static INLINE struct dri_drawable *
+static inline struct dri_drawable *
 dri_drawable(__DRIdrawable * driDrawPriv)
 {
    return (struct dri_drawable *) (driDrawPriv)
diff --git a/src/gallium/state_trackers/dri/dri_query_renderer.c b/src/gallium/state_trackers/dri/dri_query_renderer.c
index 4a28ac37b70..ea31b6c1e10 100644
--- a/src/gallium/state_trackers/dri/dri_query_renderer.c
+++ b/src/gallium/state_trackers/dri/dri_query_renderer.c
@@ -42,6 +42,20 @@ dri2_query_renderer_integer(__DRIscreen *_screen, int param,
                                                       PIPE_CAP_UMA);
       return 0;
 
+   case __DRI2_RENDERER_HAS_TEXTURE_3D:
+      value[0] =
+         screen->base.screen->get_param(screen->base.screen,
+                                        PIPE_CAP_MAX_TEXTURE_3D_LEVELS) != 0;
+      return 0;
+
+   case __DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB:
+      value[0] =
+         screen->base.screen->is_format_supported(screen->base.screen,
+                                                  PIPE_FORMAT_B8G8R8A8_SRGB,
+                                                  PIPE_TEXTURE_2D, 0,
+                                                  PIPE_BIND_RENDER_TARGET);
+      return 0;
+
    default:
       return driQueryRendererIntegerCommon(_screen, param, value);
    }
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index 85393d867e4..c4c2d9c8fb1 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -103,14 +103,18 @@ dri_fill_st_options(struct st_config_options *options,
 static const __DRIconfig **
 dri_fill_in_modes(struct dri_screen *screen)
 {
-   static const mesa_format mesa_formats[3] = {
+   static const mesa_format mesa_formats[] = {
       MESA_FORMAT_B8G8R8A8_UNORM,
       MESA_FORMAT_B8G8R8X8_UNORM,
+      MESA_FORMAT_B8G8R8A8_SRGB,
+      MESA_FORMAT_B8G8R8X8_SRGB,
       MESA_FORMAT_B5G6R5_UNORM,
    };
-   static const enum pipe_format pipe_formats[3] = {
+   static const enum pipe_format pipe_formats[] = {
       PIPE_FORMAT_BGRA8888_UNORM,
       PIPE_FORMAT_BGRX8888_UNORM,
+      PIPE_FORMAT_BGRA8888_SRGB,
+      PIPE_FORMAT_BGRX8888_SRGB,
       PIPE_FORMAT_B5G6R5_UNORM,
    };
    mesa_format format;
@@ -186,6 +190,11 @@ dri_fill_in_modes(struct dri_screen *screen)
       unsigned num_msaa_modes = 0; /* includes a single-sample mode */
       uint8_t msaa_modes[MSAA_VISUAL_MAX_SAMPLES];
 
+      if (!p_screen->is_format_supported(p_screen, pipe_formats[format],
+                                         PIPE_TEXTURE_2D, 0,
+                                         PIPE_BIND_RENDER_TARGET))
+         continue;
+
       for (i = 1; i <= msaa_samples_max; i++) {
          int samples = i > 1 ? i : 0;
 
@@ -241,9 +250,15 @@ dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
 
    if (mode->redBits == 8) {
       if (mode->alphaBits == 8)
-         stvis->color_format = PIPE_FORMAT_BGRA8888_UNORM;
+         if (mode->sRGBCapable)
+            stvis->color_format = PIPE_FORMAT_BGRA8888_SRGB;
+         else
+            stvis->color_format = PIPE_FORMAT_BGRA8888_UNORM;
       else
-         stvis->color_format = PIPE_FORMAT_BGRX8888_UNORM;
+         if (mode->sRGBCapable)
+            stvis->color_format = PIPE_FORMAT_BGRX8888_SRGB;
+         else
+            stvis->color_format = PIPE_FORMAT_BGRX8888_UNORM;
    } else {
       stvis->color_format = PIPE_FORMAT_B5G6R5_UNORM;
    }
diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h
index 173f4038cdb..4bcb0291d86 100644
--- a/src/gallium/state_trackers/dri/dri_screen.h
+++ b/src/gallium/state_trackers/dri/dri_screen.h
@@ -97,7 +97,7 @@ struct dri_screen
 };
 
 /** cast wrapper */
-static INLINE struct dri_screen *
+static inline struct dri_screen *
 dri_screen(__DRIscreen * sPriv)
 {
    return (struct dri_screen *)sPriv->driverPrivate;
@@ -122,9 +122,7 @@ struct __DRIimageRec {
 
 };
 
-#ifndef __NOT_HAVE_DRM_H
-
-static INLINE boolean
+static inline boolean
 dri_with_format(__DRIscreen * sPriv)
 {
    const __DRIdri2LoaderExtension *loader = sPriv->dri2.loader;
@@ -134,16 +132,6 @@ dri_with_format(__DRIscreen * sPriv)
        && (loader->getBuffersWithFormat != NULL);
 }
 
-#else
-
-static INLINE boolean
-dri_with_format(__DRIscreen * sPriv)
-{
-   return TRUE;
-}
-
-#endif
-
 void
 dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
                    const struct gl_config *mode);
diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c
index 4a2c1bbc2ee..4ec6992643a 100644
--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -50,7 +50,7 @@
 DEBUG_GET_ONCE_BOOL_OPTION(swrast_no_present, "SWRAST_NO_PRESENT", FALSE);
 static boolean swrast_no_present = FALSE;
 
-static INLINE void
+static inline void
 get_drawable_info(__DRIdrawable *dPriv, int *x, int *y, int *w, int *h)
 {
    __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -61,7 +61,7 @@ get_drawable_info(__DRIdrawable *dPriv, int *x, int *y, int *w, int *h)
                            dPriv->loaderPrivate);
 }
 
-static INLINE void
+static inline void
 put_image(__DRIdrawable *dPriv, void *data, unsigned width, unsigned height)
 {
    __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -72,7 +72,7 @@ put_image(__DRIdrawable *dPriv, void *data, unsigned width, unsigned height)
                     data, dPriv->loaderPrivate);
 }
 
-static INLINE void
+static inline void
 put_image2(__DRIdrawable *dPriv, void *data, int x, int y,
            unsigned width, unsigned height, unsigned stride)
 {
@@ -84,7 +84,7 @@ put_image2(__DRIdrawable *dPriv, void *data, int x, int y,
                      data, dPriv->loaderPrivate);
 }
 
-static INLINE void
+static inline void
 get_image(__DRIdrawable *dPriv, int x, int y, int width, int height, void *data)
 {
    __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -123,7 +123,7 @@ drisw_put_image2(struct dri_drawable *drawable,
    put_image2(dPriv, data, x, y, width, height, stride);
 }
 
-static INLINE void
+static inline void
 drisw_present_texture(__DRIdrawable *dPriv,
                       struct pipe_resource *ptex, struct pipe_box *sub_box)
 {
@@ -136,7 +136,7 @@ drisw_present_texture(__DRIdrawable *dPriv,
    screen->base.screen->flush_frontbuffer(screen->base.screen, ptex, 0, 0, drawable, sub_box);
 }
 
-static INLINE void
+static inline void
 drisw_invalidate_drawable(__DRIdrawable *dPriv)
 {
    struct dri_drawable *drawable = dri_drawable(dPriv);
@@ -146,7 +146,7 @@ drisw_invalidate_drawable(__DRIdrawable *dPriv)
    p_atomic_inc(&drawable->base.stamp);
 }
 
-static INLINE void
+static inline void
 drisw_copy_to_front(__DRIdrawable * dPriv,
                     struct pipe_resource *ptex)
 {
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.h b/src/gallium/state_trackers/glx/xlib/xm_api.h
index 6d0bc3f4d81..ffdffc0940f 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.h
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.h
@@ -378,13 +378,13 @@ xmesa_check_buffer_size(XMesaBuffer b);
 extern void
 xmesa_destroy_buffers_on_display(Display *dpy);
 
-static INLINE GLuint
+static inline GLuint
 xmesa_buffer_width(XMesaBuffer b)
 {
    return b->width;
 }
 
-static INLINE GLuint
+static inline GLuint
 xmesa_buffer_height(XMesaBuffer b)
 {
    return b->height;
diff --git a/src/gallium/state_trackers/glx/xlib/xm_st.c b/src/gallium/state_trackers/glx/xlib/xm_st.c
index 7f73a3a44fe..9d0f2d25025 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_st.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_st.c
@@ -46,7 +46,7 @@ struct xmesa_st_framebuffer {
 };
 
 
-static INLINE struct xmesa_st_framebuffer *
+static inline struct xmesa_st_framebuffer *
 xmesa_st_framebuffer(struct st_framebuffer_iface *stfbi)
 {
    return (struct xmesa_st_framebuffer *) stfbi->st_manager_private;
diff --git a/src/gallium/state_trackers/hgl/hgl.c b/src/gallium/state_trackers/hgl/hgl.c
index 1e804c07e6b..0e122fe86ae 100644
--- a/src/gallium/state_trackers/hgl/hgl.c
+++ b/src/gallium/state_trackers/hgl/hgl.c
@@ -32,7 +32,7 @@
 
 
 // Perform a safe void to hgl_context cast
-static INLINE struct hgl_context*
+static inline struct hgl_context*
 hgl_st_context(struct st_context_iface *stctxi)
 {
 	struct hgl_context* context;
@@ -44,7 +44,7 @@ hgl_st_context(struct st_context_iface *stctxi)
 
 
 // Perform a safe void to hgl_buffer cast
-static INLINE struct hgl_buffer*
+static inline struct hgl_buffer*
 hgl_st_framebuffer(struct st_framebuffer_iface *stfbi)
 {
 	struct hgl_buffer* buffer;
diff --git a/src/gallium/state_trackers/nine/adapter9.c b/src/gallium/state_trackers/nine/adapter9.c
index 9d6d6590e00..c5ffcb15a18 100644
--- a/src/gallium/state_trackers/nine/adapter9.c
+++ b/src/gallium/state_trackers/nine/adapter9.c
@@ -163,7 +163,7 @@ NineAdapter9_GetAdapterIdentifier( struct NineAdapter9 *This,
     return D3D_OK;
 }
 
-static INLINE boolean
+static inline boolean
 backbuffer_format( D3DFORMAT dfmt,
                    D3DFORMAT bfmt,
                    boolean win )
@@ -220,7 +220,7 @@ NineAdapter9_CheckDeviceType( struct NineAdapter9 *This,
     return D3D_OK;
 }
 
-static INLINE boolean
+static inline boolean
 display_format( D3DFORMAT fmt,
                 boolean win )
 {
diff --git a/src/gallium/state_trackers/nine/adapter9.h b/src/gallium/state_trackers/nine/adapter9.h
index df85b2dcc28..2129ec8edc0 100644
--- a/src/gallium/state_trackers/nine/adapter9.h
+++ b/src/gallium/state_trackers/nine/adapter9.h
@@ -49,7 +49,7 @@ struct NineAdapter9
     
     struct d3dadapter9_context *ctx;
 };
-static INLINE struct NineAdapter9 *
+static inline struct NineAdapter9 *
 NineAdapter9( void *data )
 {
     return (struct NineAdapter9 *)data;
diff --git a/src/gallium/state_trackers/nine/authenticatedchannel9.h b/src/gallium/state_trackers/nine/authenticatedchannel9.h
index 7d374f67fca..63cb2269db4 100644
--- a/src/gallium/state_trackers/nine/authenticatedchannel9.h
+++ b/src/gallium/state_trackers/nine/authenticatedchannel9.h
@@ -29,7 +29,7 @@ struct NineAuthenticatedChannel9
 {
     struct NineUnknown base;
 };
-static INLINE struct NineAuthenticatedChannel9 *
+static inline struct NineAuthenticatedChannel9 *
 NineAuthenticatedChannel9( void *data )
 {
     return (struct NineAuthenticatedChannel9 *)data;
diff --git a/src/gallium/state_trackers/nine/basetexture9.h b/src/gallium/state_trackers/nine/basetexture9.h
index c803280decd..9d6fb0c002a 100644
--- a/src/gallium/state_trackers/nine/basetexture9.h
+++ b/src/gallium/state_trackers/nine/basetexture9.h
@@ -53,7 +53,7 @@ struct NineBaseTexture9
         DWORD lod_resident;
     } managed;
 };
-static INLINE struct NineBaseTexture9 *
+static inline struct NineBaseTexture9 *
 NineBaseTexture9( void *data )
 {
     return (struct NineBaseTexture9 *)data;
@@ -107,7 +107,7 @@ HRESULT
 NineBaseTexture9_UpdateSamplerView( struct NineBaseTexture9 *This,
                                     const int sRGB );
 
-static INLINE void
+static inline void
 NineBaseTexture9_Validate( struct NineBaseTexture9 *This )
 {
     DBG_FLAG(DBG_BASETEXTURE, "This=%p dirty=%i dirty_mip=%i lod=%u/%u\n",
@@ -119,7 +119,7 @@ NineBaseTexture9_Validate( struct NineBaseTexture9 *This )
         NineBaseTexture9_GenerateMipSubLevels(This);
 }
 
-static INLINE struct pipe_sampler_view *
+static inline struct pipe_sampler_view *
 NineBaseTexture9_GetSamplerView( struct NineBaseTexture9 *This, const int sRGB )
 {
     if (!This->view[sRGB])
@@ -131,7 +131,7 @@ NineBaseTexture9_GetSamplerView( struct NineBaseTexture9 *This, const int sRGB )
 void
 NineBaseTexture9_Dump( struct NineBaseTexture9 *This );
 #else
-static INLINE void
+static inline void
 NineBaseTexture9_Dump( struct NineBaseTexture9 *This ) { }
 #endif
 
diff --git a/src/gallium/state_trackers/nine/cryptosession9.h b/src/gallium/state_trackers/nine/cryptosession9.h
index 660d246bfa0..d1eab72eb37 100644
--- a/src/gallium/state_trackers/nine/cryptosession9.h
+++ b/src/gallium/state_trackers/nine/cryptosession9.h
@@ -29,7 +29,7 @@ struct NineCryptoSession9
 {
     struct NineUnknown base;
 };
-static INLINE struct NineCryptoSession9 *
+static inline struct NineCryptoSession9 *
 NineCryptoSession9( void *data )
 {
     return (struct NineCryptoSession9 *)data;
diff --git a/src/gallium/state_trackers/nine/cubetexture9.h b/src/gallium/state_trackers/nine/cubetexture9.h
index ee7e275e4d8..999715c0a74 100644
--- a/src/gallium/state_trackers/nine/cubetexture9.h
+++ b/src/gallium/state_trackers/nine/cubetexture9.h
@@ -33,7 +33,7 @@ struct NineCubeTexture9
     struct pipe_box dirty_rect[6]; /* covers all mip levels */
     uint8_t *managed_buffer;
 };
-static INLINE struct NineCubeTexture9 *
+static inline struct NineCubeTexture9 *
 NineCubeTexture9( void *data )
 {
     return (struct NineCubeTexture9 *)data;
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 466b9376ce5..55948cbb67f 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -510,7 +510,7 @@ NineDevice9_GetCaps( struct NineDevice9 *This )
     return &This->caps;
 }
 
-static INLINE void
+static inline void
 NineDevice9_PauseRecording( struct NineDevice9 *This )
 {
     if (This->record) {
@@ -519,7 +519,7 @@ NineDevice9_PauseRecording( struct NineDevice9 *This )
     }
 }
 
-static INLINE void
+static inline void
 NineDevice9_ResumeRecording( struct NineDevice9 *This )
 {
     if (This->record) {
@@ -2697,7 +2697,7 @@ NineDevice9_GetNPatchMode( struct NineDevice9 *This )
     STUB(0);
 }
 
-static INLINE void
+static inline void
 init_draw_info(struct pipe_draw_info *info,
                struct NineDevice9 *dev, D3DPRIMITIVETYPE type, UINT count)
 {
diff --git a/src/gallium/state_trackers/nine/device9.h b/src/gallium/state_trackers/nine/device9.h
index c66a273bf2e..74607451c5f 100644
--- a/src/gallium/state_trackers/nine/device9.h
+++ b/src/gallium/state_trackers/nine/device9.h
@@ -132,7 +132,7 @@ struct NineDevice9
      * is not bound to anything by the vertex declaration */
     struct pipe_resource *dummy_vbo;
 };
-static INLINE struct NineDevice9 *
+static inline struct NineDevice9 *
 NineDevice9( void *data )
 {
     return (struct NineDevice9 *)data;
diff --git a/src/gallium/state_trackers/nine/device9ex.h b/src/gallium/state_trackers/nine/device9ex.h
index a31c720553a..8375622d8a1 100644
--- a/src/gallium/state_trackers/nine/device9ex.h
+++ b/src/gallium/state_trackers/nine/device9ex.h
@@ -29,7 +29,7 @@ struct NineDevice9Ex
 {
     struct NineDevice9 base;
 };
-static INLINE struct NineDevice9Ex *
+static inline struct NineDevice9Ex *
 NineDevice9Ex( void *data )
 {
     return (struct NineDevice9Ex *)data;
diff --git a/src/gallium/state_trackers/nine/device9video.h b/src/gallium/state_trackers/nine/device9video.h
index ca041e55fbc..fc2faeb624a 100644
--- a/src/gallium/state_trackers/nine/device9video.h
+++ b/src/gallium/state_trackers/nine/device9video.h
@@ -29,7 +29,7 @@ struct NineDevice9Video
 {
     struct NineUnknown base;
 };
-static INLINE struct NineDevice9Video *
+static inline struct NineDevice9Video *
 NineDevice9Video( void *data )
 {
     return (struct NineDevice9Video *)data;
diff --git a/src/gallium/state_trackers/nine/indexbuffer9.h b/src/gallium/state_trackers/nine/indexbuffer9.h
index 0982a93fbb1..f10578f47ba 100644
--- a/src/gallium/state_trackers/nine/indexbuffer9.h
+++ b/src/gallium/state_trackers/nine/indexbuffer9.h
@@ -45,7 +45,7 @@ struct NineIndexBuffer9
 
     D3DINDEXBUFFER_DESC desc;
 };
-static INLINE struct NineIndexBuffer9 *
+static inline struct NineIndexBuffer9 *
 NineIndexBuffer9( void *data )
 {
     return (struct NineIndexBuffer9 *)data;
diff --git a/src/gallium/state_trackers/nine/iunknown.h b/src/gallium/state_trackers/nine/iunknown.h
index 4c83ddd8e4e..628d984553e 100644
--- a/src/gallium/state_trackers/nine/iunknown.h
+++ b/src/gallium/state_trackers/nine/iunknown.h
@@ -52,7 +52,7 @@ struct NineUnknown
 
     void (*dtor)(void *data); /* top-level dtor */
 };
-static INLINE struct NineUnknown *
+static inline struct NineUnknown *
 NineUnknown( void *data )
 {
     return (struct NineUnknown *)data;
@@ -94,14 +94,14 @@ NineUnknown_GetDevice( struct NineUnknown *This,
 
 /*** Nine private methods ***/
 
-static INLINE void
+static inline void
 NineUnknown_Destroy( struct NineUnknown *This )
 {
     assert(!(This->refs | This->bind));
     This->dtor(This);
 }
 
-static INLINE UINT
+static inline UINT
 NineUnknown_Bind( struct NineUnknown *This )
 {
     UINT b = ++This->bind;
@@ -113,7 +113,7 @@ NineUnknown_Bind( struct NineUnknown *This )
     return b;
 }
 
-static INLINE UINT
+static inline UINT
 NineUnknown_Unbind( struct NineUnknown *This )
 {
     UINT b = --This->bind;
@@ -129,7 +129,7 @@ NineUnknown_Unbind( struct NineUnknown *This )
     return b;
 }
 
-static INLINE void
+static inline void
 NineUnknown_ConvertRefToBind( struct NineUnknown *This )
 {
     NineUnknown_Bind(This);
@@ -137,7 +137,7 @@ NineUnknown_ConvertRefToBind( struct NineUnknown *This )
 }
 
 /* Detach from container. */
-static INLINE void
+static inline void
 NineUnknown_Detach( struct NineUnknown *This )
 {
     assert(This->container && !This->forward);
diff --git a/src/gallium/state_trackers/nine/nine_dump.h b/src/gallium/state_trackers/nine/nine_dump.h
index d0d4a9eb3aa..a0ffe7bf6ab 100644
--- a/src/gallium/state_trackers/nine/nine_dump.h
+++ b/src/gallium/state_trackers/nine/nine_dump.h
@@ -31,19 +31,19 @@ nine_dump_D3DTSS_value(unsigned, D3DTEXTURESTAGESTATETYPE, DWORD);
 
 #else /* !DEBUG */
 
-static INLINE void
+static inline void
 nine_dump_D3DADAPTER_IDENTIFIER9(unsigned ch, const D3DADAPTER_IDENTIFIER9 *id)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DCAPS9(unsigned ch, const D3DCAPS9 *caps)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DLIGHT9(unsigned ch, const D3DLIGHT9 *light)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DMATERIAL9(unsigned ch, const D3DMATERIAL9 *mat)
 { }
-static INLINE void
+static inline void
 nine_dump_D3DTSS_value(unsigned ch, D3DTEXTURESTAGESTATETYPE tss, DWORD value)
 { }
 
diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index c2213e6bf11..8a53f0d9038 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -295,7 +295,7 @@ struct vs_build_ctx
     struct ureg_src mtlE;
 };
 
-static INLINE unsigned
+static inline unsigned
 get_texcoord_sn(struct pipe_screen *screen)
 {
     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
@@ -303,7 +303,7 @@ get_texcoord_sn(struct pipe_screen *screen)
     return TGSI_SEMANTIC_GENERIC;
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
 {
     const unsigned i = vs->num_inputs++;
@@ -313,7 +313,7 @@ build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
 }
 
 /* NOTE: dst may alias src */
-static INLINE void
+static inline void
 ureg_normalize3(struct ureg_program *ureg,
                 struct ureg_dst dst, struct ureg_src src,
                 struct ureg_dst tmp)
@@ -1033,7 +1033,7 @@ static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
     }
 }
 
-static INLINE boolean
+static inline boolean
 is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
 {
     return !dst.WriteMask ||
@@ -1973,7 +1973,7 @@ nine_D3DMATRIX_print(const D3DMATRIX *M)
 }
 */
 
-static INLINE float
+static inline float
 nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
 {
     return A->m[r][0] * B->m[0][c] +
@@ -1982,7 +1982,7 @@ nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
            A->m[r][3] * B->m[3][c];
 }
 
-static INLINE float
+static inline float
 nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
 {
     return v->x * M->m[0][c] +
@@ -1991,7 +1991,7 @@ nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
            1.0f * M->m[3][c];
 }
 
-static INLINE float
+static inline float
 nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
 {
     return v->x * M->m[0][c] +
diff --git a/src/gallium/state_trackers/nine/nine_helpers.c b/src/gallium/state_trackers/nine/nine_helpers.c
index ed179f9aedc..98c2ae30eba 100644
--- a/src/gallium/state_trackers/nine/nine_helpers.c
+++ b/src/gallium/state_trackers/nine/nine_helpers.c
@@ -49,7 +49,7 @@ nine_range_pool_more(struct nine_range_pool *pool)
     return pool->free;
 }
 
-static INLINE struct nine_range *
+static inline struct nine_range *
 nine_range_pool_get(struct nine_range_pool *pool, int16_t bgn, int16_t end)
 {
     struct nine_range *r = pool->free;
@@ -62,7 +62,7 @@ nine_range_pool_get(struct nine_range_pool *pool, int16_t bgn, int16_t end)
     return r;
 }
 
-static INLINE void
+static inline void
 nine_ranges_coalesce(struct nine_range *r, struct nine_range_pool *pool)
 {
     struct nine_range *n;
diff --git a/src/gallium/state_trackers/nine/nine_helpers.h b/src/gallium/state_trackers/nine/nine_helpers.h
index 6751a822ec2..b382c5b72b3 100644
--- a/src/gallium/state_trackers/nine/nine_helpers.h
+++ b/src/gallium/state_trackers/nine/nine_helpers.h
@@ -123,7 +123,7 @@ static inline void _nine_bind(void **dst, void *obj)
     } \
     return D3D_OK
 
-static INLINE float asfloat(DWORD value)
+static inline float asfloat(DWORD value)
 {
     union {
         float f;
@@ -149,14 +149,14 @@ struct nine_range_pool
     unsigned num_slabs_max;
 };
 
-static INLINE void
+static inline void
 nine_range_pool_put(struct nine_range_pool *pool, struct nine_range *r)
 {
     r->next = pool->free;
     pool->free = r;
 }
 
-static INLINE void
+static inline void
 nine_range_pool_put_chain(struct nine_range_pool *pool,
                           struct nine_range *head,
                           struct nine_range *tail)
diff --git a/src/gallium/state_trackers/nine/nine_pipe.c b/src/gallium/state_trackers/nine/nine_pipe.c
index 0da0b20263d..4cf37b9f59c 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.c
+++ b/src/gallium/state_trackers/nine/nine_pipe.c
@@ -118,7 +118,7 @@ nine_convert_rasterizer_state(struct cso_context *ctx, const DWORD *rs)
     cso_set_rasterizer(ctx, &rast);
 }
 
-static INLINE void
+static inline void
 nine_convert_blend_state_fixup(struct pipe_blend_state *blend, const DWORD *rs)
 {
     if (unlikely(rs[D3DRS_SRCBLEND] == D3DBLEND_BOTHSRCALPHA ||
diff --git a/src/gallium/state_trackers/nine/nine_pipe.h b/src/gallium/state_trackers/nine/nine_pipe.h
index 91da5630122..43a7737cdf9 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.h
+++ b/src/gallium/state_trackers/nine/nine_pipe.h
@@ -43,7 +43,7 @@ void nine_convert_sampler_state(struct cso_context *, int idx, const DWORD *);
 
 void nine_pipe_context_clear(struct NineDevice9 *);
 
-static INLINE unsigned d3dlock_buffer_to_pipe_transfer_usage(DWORD Flags)
+static inline unsigned d3dlock_buffer_to_pipe_transfer_usage(DWORD Flags)
 {
     unsigned usage;
 
@@ -70,7 +70,7 @@ static INLINE unsigned d3dlock_buffer_to_pipe_transfer_usage(DWORD Flags)
     return usage;
 }
 
-static INLINE void
+static inline void
 rect_to_pipe_box(struct pipe_box *dst, const RECT *src)
 {
     dst->x = src->left;
@@ -81,7 +81,7 @@ rect_to_pipe_box(struct pipe_box *dst, const RECT *src)
     dst->depth = 1;
 }
 
-static INLINE boolean
+static inline boolean
 rect_to_pipe_box_clamp(struct pipe_box *dst, const RECT *src)
 {
     rect_to_pipe_box(dst, src);
@@ -95,7 +95,7 @@ rect_to_pipe_box_clamp(struct pipe_box *dst, const RECT *src)
     return FALSE;
 }
 
-static INLINE boolean
+static inline boolean
 rect_to_pipe_box_flip(struct pipe_box *dst, const RECT *src)
 {
     rect_to_pipe_box(dst, src);
@@ -107,7 +107,7 @@ rect_to_pipe_box_flip(struct pipe_box *dst, const RECT *src)
     return TRUE;
 }
 
-static INLINE void
+static inline void
 rect_to_pipe_box_xy_only(struct pipe_box *dst, const RECT *src)
 {
     user_warn(src->left > src->right || src->top > src->bottom);
@@ -118,7 +118,7 @@ rect_to_pipe_box_xy_only(struct pipe_box *dst, const RECT *src)
     dst->height = src->bottom - src->top;
 }
 
-static INLINE boolean
+static inline boolean
 rect_to_pipe_box_xy_only_clamp(struct pipe_box *dst, const RECT *src)
 {
     rect_to_pipe_box_xy_only(dst, src);
@@ -132,7 +132,7 @@ rect_to_pipe_box_xy_only_clamp(struct pipe_box *dst, const RECT *src)
     return FALSE;
 }
 
-static INLINE void
+static inline void
 rect_to_g3d_u_rect(struct u_rect *dst, const RECT *src)
 {
     user_warn(src->left > src->right || src->top > src->bottom);
@@ -143,7 +143,7 @@ rect_to_g3d_u_rect(struct u_rect *dst, const RECT *src)
     dst->y1 = src->bottom;
 }
 
-static INLINE void
+static inline void
 d3dbox_to_pipe_box(struct pipe_box *dst, const D3DBOX *src)
 {
     user_warn(src->Left > src->Right);
@@ -158,13 +158,13 @@ d3dbox_to_pipe_box(struct pipe_box *dst, const D3DBOX *src)
     dst->depth = src->Back - src->Front;
 }
 
-static INLINE D3DFORMAT
+static inline D3DFORMAT
 pipe_to_d3d9_format(enum pipe_format format)
 {
     return nine_pipe_to_d3d9_format_map[format];
 }
 
-static INLINE boolean
+static inline boolean
 depth_stencil_format( D3DFORMAT fmt )
 {
     static D3DFORMAT allowed[] = {
@@ -190,7 +190,7 @@ depth_stencil_format( D3DFORMAT fmt )
     return FALSE;
 }
 
-static INLINE unsigned
+static inline unsigned
 d3d9_get_pipe_depth_format_bindings(D3DFORMAT format)
 {
     switch (format) {
@@ -215,7 +215,7 @@ d3d9_get_pipe_depth_format_bindings(D3DFORMAT format)
     }
 }
 
-static INLINE enum pipe_format
+static inline enum pipe_format
 d3d9_to_pipe_format_internal(D3DFORMAT format)
 {
     if (format <= D3DFMT_A2B10G10R10_XR_BIAS)
@@ -257,7 +257,7 @@ d3d9_to_pipe_format_internal(D3DFORMAT format)
     screen->is_format_supported(screen, pipe_format, target, \
                                 sample_count, bindings)
 
-static INLINE enum pipe_format
+static inline enum pipe_format
 d3d9_to_pipe_format_checked(struct pipe_screen *screen,
                             D3DFORMAT format,
                             enum pipe_texture_target target,
@@ -298,7 +298,7 @@ d3d9_to_pipe_format_checked(struct pipe_screen *screen,
     return PIPE_FORMAT_NONE;
 }
 
-static INLINE const char *
+static inline const char *
 d3dformat_to_string(D3DFORMAT fmt)
 {
     switch (fmt) {
@@ -381,7 +381,7 @@ d3dformat_to_string(D3DFORMAT fmt)
     return "Unknown";
 }
 
-static INLINE unsigned
+static inline unsigned
 nine_fvf_stride( DWORD fvf )
 {
     unsigned texcount, i, size = 0;
@@ -428,7 +428,7 @@ nine_fvf_stride( DWORD fvf )
     return size;
 }
 
-static INLINE void
+static inline void
 d3dcolor_to_rgba(float *rgba, D3DCOLOR color)
 {
     rgba[0] = (float)((color >> 16) & 0xFF) / 0xFF;
@@ -437,13 +437,13 @@ d3dcolor_to_rgba(float *rgba, D3DCOLOR color)
     rgba[3] = (float)((color >> 24) & 0xFF) / 0xFF;
 }
 
-static INLINE void
+static inline void
 d3dcolor_to_pipe_color_union(union pipe_color_union *rgba, D3DCOLOR color)
 {
     d3dcolor_to_rgba(&rgba->f[0], color);
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dprimitivetype_to_pipe_prim(D3DPRIMITIVETYPE prim)
 {
     switch (prim) {
@@ -459,7 +459,7 @@ d3dprimitivetype_to_pipe_prim(D3DPRIMITIVETYPE prim)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 prim_count_to_vertex_count(D3DPRIMITIVETYPE prim, UINT count)
 {
     switch (prim) {
@@ -475,7 +475,7 @@ prim_count_to_vertex_count(D3DPRIMITIVETYPE prim, UINT count)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dcmpfunc_to_pipe_func(D3DCMPFUNC func)
 {
     switch (func) {
@@ -494,7 +494,7 @@ d3dcmpfunc_to_pipe_func(D3DCMPFUNC func)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dstencilop_to_pipe_stencil_op(D3DSTENCILOP op)
 {
     switch (op) {
@@ -511,7 +511,7 @@ d3dstencilop_to_pipe_stencil_op(D3DSTENCILOP op)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dcull_to_pipe_face(D3DCULL cull)
 {
     switch (cull) {
@@ -524,7 +524,7 @@ d3dcull_to_pipe_face(D3DCULL cull)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dfillmode_to_pipe_polygon_mode(D3DFILLMODE mode)
 {
     switch (mode) {
@@ -538,7 +538,7 @@ d3dfillmode_to_pipe_polygon_mode(D3DFILLMODE mode)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dblendop_to_pipe_blend(D3DBLENDOP op)
 {
     switch (op) {
@@ -557,7 +557,7 @@ d3dblendop_to_pipe_blend(D3DBLENDOP op)
  * Drivers may check RGB and ALPHA factors for equality so we should not
  * simply substitute the ALPHA variants.
  */
-static INLINE unsigned
+static inline unsigned
 d3dblend_alpha_to_pipe_blendfactor(D3DBLEND b)
 {
     switch (b) {
@@ -584,7 +584,7 @@ d3dblend_alpha_to_pipe_blendfactor(D3DBLEND b)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dblend_color_to_pipe_blendfactor(D3DBLEND b)
 {
     switch (b) {
@@ -611,7 +611,7 @@ d3dblend_color_to_pipe_blendfactor(D3DBLEND b)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dtextureaddress_to_pipe_tex_wrap(D3DTEXTUREADDRESS addr)
 {
     switch (addr) {
@@ -626,7 +626,7 @@ d3dtextureaddress_to_pipe_tex_wrap(D3DTEXTUREADDRESS addr)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dtexturefiltertype_to_pipe_tex_filter(D3DTEXTUREFILTERTYPE filter)
 {
     switch (filter) {
@@ -644,7 +644,7 @@ d3dtexturefiltertype_to_pipe_tex_filter(D3DTEXTUREFILTERTYPE filter)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 d3dtexturefiltertype_to_pipe_tex_mipfilter(D3DTEXTUREFILTERTYPE filter)
 {
     switch (filter) {
@@ -662,7 +662,7 @@ d3dtexturefiltertype_to_pipe_tex_mipfilter(D3DTEXTUREFILTERTYPE filter)
     }
 }
 
-static INLINE unsigned nine_format_get_stride(enum pipe_format format,
+static inline unsigned nine_format_get_stride(enum pipe_format format,
                                               unsigned width)
 {
     unsigned stride = util_format_get_stride(format, width);
@@ -670,7 +670,7 @@ static INLINE unsigned nine_format_get_stride(enum pipe_format format,
     return align(stride, 4);
 }
 
-static INLINE unsigned nine_format_get_level_alloc_size(enum pipe_format format,
+static inline unsigned nine_format_get_level_alloc_size(enum pipe_format format,
                                                         unsigned width,
                                                         unsigned height,
                                                         unsigned level)
@@ -684,7 +684,7 @@ static INLINE unsigned nine_format_get_level_alloc_size(enum pipe_format format,
     return size;
 }
 
-static INLINE unsigned nine_format_get_size_and_offsets(enum pipe_format format,
+static inline unsigned nine_format_get_size_and_offsets(enum pipe_format format,
                                                         unsigned *offsets,
                                                         unsigned width,
                                                         unsigned height,
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 22a58825f78..754f5af6b8e 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -43,7 +43,7 @@ struct shader_translator;
 
 typedef HRESULT (*translate_instruction_func)(struct shader_translator *);
 
-static INLINE const char *d3dsio_to_string(unsigned opcode);
+static inline const char *d3dsio_to_string(unsigned opcode);
 
 
 #define NINED3D_SM1_VS 0xfffe
@@ -239,7 +239,7 @@ struct sm1_dst_param
     BYTE type;
 };
 
-static INLINE void
+static inline void
 assert_replicate_swizzle(const struct ureg_src *reg)
 {
     assert(reg->SwizzleY == reg->SwizzleX &&
@@ -608,7 +608,7 @@ tx_set_lconstb(struct shader_translator *tx, INT index, BOOL b)
        ureg_imm1f(tx->ureg, b ? 1.0f : 0.0f);
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 tx_scratch(struct shader_translator *tx)
 {
     if (tx->num_scratch >= Elements(tx->regs.t)) {
@@ -620,13 +620,13 @@ tx_scratch(struct shader_translator *tx)
     return tx->regs.t[tx->num_scratch++];
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 tx_scratch_scalar(struct shader_translator *tx)
 {
     return ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
 }
 
-static INLINE struct ureg_src
+static inline struct ureg_src
 tx_src_scalar(struct ureg_dst dst)
 {
     struct ureg_src src = ureg_src(dst);
@@ -636,7 +636,7 @@ tx_src_scalar(struct ureg_dst dst)
     return src;
 }
 
-static INLINE void
+static inline void
 tx_temp_alloc(struct shader_translator *tx, INT idx)
 {
     assert(idx >= 0);
@@ -654,7 +654,7 @@ tx_temp_alloc(struct shader_translator *tx, INT idx)
         tx->regs.r[idx] = ureg_DECL_temporary(tx->ureg);
 }
 
-static INLINE void
+static inline void
 tx_addr_alloc(struct shader_translator *tx, INT idx)
 {
     assert(idx == 0);
@@ -664,7 +664,7 @@ tx_addr_alloc(struct shader_translator *tx, INT idx)
         tx->regs.a0 = ureg_DECL_temporary(tx->ureg);
 }
 
-static INLINE void
+static inline void
 tx_pred_alloc(struct shader_translator *tx, INT idx)
 {
     assert(idx == 0);
@@ -672,7 +672,7 @@ tx_pred_alloc(struct shader_translator *tx, INT idx)
         tx->regs.p = ureg_DECL_predicate(tx->ureg);
 }
 
-static INLINE void
+static inline void
 tx_texcoord_alloc(struct shader_translator *tx, INT idx)
 {
     assert(IS_PS);
@@ -682,7 +682,7 @@ tx_texcoord_alloc(struct shader_translator *tx, INT idx)
                                              TGSI_INTERPOLATE_PERSPECTIVE);
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_bgnloop(struct shader_translator *tx)
 {
     tx->loop_depth++;
@@ -692,7 +692,7 @@ tx_bgnloop(struct shader_translator *tx)
     return &tx->loop_labels[tx->loop_depth - 1];
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_endloop(struct shader_translator *tx)
 {
     assert(tx->loop_depth);
@@ -741,7 +741,7 @@ tx_get_loopal(struct shader_translator *tx)
     return ureg_src_undef();
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_cond(struct shader_translator *tx)
 {
    assert(tx->cond_depth <= NINE_MAX_COND_DEPTH);
@@ -749,14 +749,14 @@ tx_cond(struct shader_translator *tx)
    return &tx->cond_labels[tx->cond_depth - 1];
 }
 
-static INLINE unsigned *
+static inline unsigned *
 tx_elsecond(struct shader_translator *tx)
 {
    assert(tx->cond_depth);
    return &tx->cond_labels[tx->cond_depth - 1];
 }
 
-static INLINE void
+static inline void
 tx_endcond(struct shader_translator *tx)
 {
    assert(tx->cond_depth);
@@ -765,7 +765,7 @@ tx_endcond(struct shader_translator *tx)
                     ureg_get_instruction_number(tx->ureg));
 }
 
-static INLINE struct ureg_dst
+static inline struct ureg_dst
 nine_ureg_dst_register(unsigned file, int index)
 {
     return ureg_dst(ureg_src_register(file, index));
@@ -1240,7 +1240,7 @@ NineTranslateInstruction_Mkxn(struct shader_translator *tx, const unsigned k, co
 #define VNOTSUPPORTED   0, 0
 #define V(maj, min)     (((maj) << 8) | (min))
 
-static INLINE const char *
+static inline const char *
 d3dsio_to_string( unsigned opcode )
 {
     static const char *names[] = {
@@ -1657,7 +1657,7 @@ DECL_SPECIAL(IF)
     return D3D_OK;
 }
 
-static INLINE unsigned
+static inline unsigned
 sm1_insn_flags_to_tgsi_setop(BYTE flags)
 {
     switch (flags) {
@@ -1724,7 +1724,7 @@ static const char *sm1_declusage_names[] =
     [D3DDECLUSAGE_SAMPLE] = "SAMPLE"
 };
 
-static INLINE unsigned
+static inline unsigned
 sm1_to_nine_declusage(struct sm1_semantic *dcl)
 {
     return nine_d3d9_to_nine_declusage(dcl->usage, dcl->usage_idx);
@@ -1833,7 +1833,7 @@ sm1_declusage_to_tgsi(struct tgsi_declaration_semantic *sem,
 #define NINED3DSTT_2D     (D3DSTT_2D >> D3DSP_TEXTURETYPE_SHIFT)
 #define NINED3DSTT_VOLUME (D3DSTT_VOLUME >> D3DSP_TEXTURETYPE_SHIFT)
 #define NINED3DSTT_CUBE   (D3DSTT_CUBE >> D3DSP_TEXTURETYPE_SHIFT)
-static INLINE unsigned
+static inline unsigned
 d3dstt_to_tgsi_tex(BYTE sampler_type)
 {
     switch (sampler_type) {
@@ -1846,7 +1846,7 @@ d3dstt_to_tgsi_tex(BYTE sampler_type)
         return TGSI_TEXTURE_UNKNOWN;
     }
 }
-static INLINE unsigned
+static inline unsigned
 d3dstt_to_tgsi_tex_shadow(BYTE sampler_type)
 {
     switch (sampler_type) {
@@ -1859,7 +1859,7 @@ d3dstt_to_tgsi_tex_shadow(BYTE sampler_type)
         return TGSI_TEXTURE_UNKNOWN;
     }
 }
-static INLINE unsigned
+static inline unsigned
 ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage)
 {
     switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) {
@@ -1884,7 +1884,7 @@ sm1_sampler_type_name(BYTE sampler_type)
     }
 }
 
-static INLINE unsigned
+static inline unsigned
 nine_tgsi_to_interp_mode(struct tgsi_declaration_semantic *sem)
 {
     switch (sem->Name) {
@@ -2685,7 +2685,7 @@ create_op_info_map(struct shader_translator *tx)
     }
 }
 
-static INLINE HRESULT
+static inline HRESULT
 NineTranslateInstruction_Generic(struct shader_translator *tx)
 {
     struct ureg_dst dst[1];
@@ -2703,19 +2703,19 @@ NineTranslateInstruction_Generic(struct shader_translator *tx)
     return D3D_OK;
 }
 
-static INLINE DWORD
+static inline DWORD
 TOKEN_PEEK(struct shader_translator *tx)
 {
     return *(tx->parse);
 }
 
-static INLINE DWORD
+static inline DWORD
 TOKEN_NEXT(struct shader_translator *tx)
 {
     return *(tx->parse)++;
 }
 
-static INLINE void
+static inline void
 TOKEN_JUMP(struct shader_translator *tx)
 {
     if (tx->parse_next && tx->parse != tx->parse_next) {
@@ -2724,7 +2724,7 @@ TOKEN_JUMP(struct shader_translator *tx)
     }
 }
 
-static INLINE boolean
+static inline boolean
 sm1_parse_eof(struct shader_translator *tx)
 {
     return TOKEN_PEEK(tx) == NINED3DSP_END;
@@ -3063,7 +3063,7 @@ tx_dtor(struct shader_translator *tx)
     FREE(tx);
 }
 
-static INLINE unsigned
+static inline unsigned
 tgsi_processor_from_type(unsigned shader_type)
 {
     switch (shader_type) {
diff --git a/src/gallium/state_trackers/nine/nine_shader.h b/src/gallium/state_trackers/nine/nine_shader.h
index 56c5d99b4d2..ec256c153a9 100644
--- a/src/gallium/state_trackers/nine/nine_shader.h
+++ b/src/gallium/state_trackers/nine/nine_shader.h
@@ -70,19 +70,19 @@ struct nine_shader_info
     struct nine_lconstf lconstf; /* out, NOTE: members to be free'd by user */
 };
 
-static INLINE void
+static inline void
 nine_info_mark_const_f_used(struct nine_shader_info *info, int idx)
 {
     if (info->const_float_slots < (idx + 1))
         info->const_float_slots = idx + 1;
 }
-static INLINE void
+static inline void
 nine_info_mark_const_i_used(struct nine_shader_info *info, int idx)
 {
     if (info->const_int_slots < (idx + 1))
         info->const_int_slots = idx + 1;
 }
-static INLINE void
+static inline void
 nine_info_mark_const_b_used(struct nine_shader_info *info, int idx)
 {
     if (info->const_bool_slots < (idx + 1))
@@ -100,7 +100,7 @@ struct nine_shader_variant
     uint32_t key;
 };
 
-static INLINE void *
+static inline void *
 nine_shader_variant_get(struct nine_shader_variant *list, uint32_t key)
 {
     while (list->key != key && list->next)
@@ -110,7 +110,7 @@ nine_shader_variant_get(struct nine_shader_variant *list, uint32_t key)
     return NULL;
 }
 
-static INLINE boolean
+static inline boolean
 nine_shader_variant_add(struct nine_shader_variant *list,
                         uint32_t key, void *cso)
 {
@@ -127,7 +127,7 @@ nine_shader_variant_add(struct nine_shader_variant *list,
     return TRUE;
 }
 
-static INLINE void
+static inline void
 nine_shader_variants_free(struct nine_shader_variant *list)
 {
     while (list->next) {
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index 435118bc93f..6c835858d18 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -176,7 +176,7 @@ update_viewport(struct NineDevice9 *device)
     pipe->set_viewport_states(pipe, 0, 1, &pvport);
 }
 
-static INLINE void
+static inline void
 update_scissor(struct NineDevice9 *device)
 {
     struct pipe_context *pipe = device->pipe;
@@ -184,19 +184,19 @@ update_scissor(struct NineDevice9 *device)
     pipe->set_scissor_states(pipe, 0, 1, &device->state.scissor);
 }
 
-static INLINE void
+static inline void
 update_blend(struct NineDevice9 *device)
 {
     nine_convert_blend_state(device->cso, device->state.rs);
 }
 
-static INLINE void
+static inline void
 update_dsa(struct NineDevice9 *device)
 {
     nine_convert_dsa_state(device->cso, device->state.rs);
 }
 
-static INLINE void
+static inline void
 update_rasterizer(struct NineDevice9 *device)
 {
     nine_convert_rasterizer_state(device->cso, device->state.rs);
@@ -294,7 +294,7 @@ update_vertex_elements(struct NineDevice9 *device)
     state->changed.stream_freq = 0;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 update_shader_variant_keys(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
@@ -332,7 +332,7 @@ update_shader_variant_keys(struct NineDevice9 *device)
     return mask;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 update_vs(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
@@ -359,7 +359,7 @@ update_vs(struct NineDevice9 *device)
     return changed_group;
 }
 
-static INLINE uint32_t
+static inline uint32_t
 update_ps(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
@@ -656,7 +656,7 @@ update_vertex_buffers(struct NineDevice9 *device)
     state->changed.vtxbuf = 0;
 }
 
-static INLINE void
+static inline void
 update_index_buffer(struct NineDevice9 *device)
 {
     struct pipe_context *pipe = device->pipe;
@@ -677,7 +677,7 @@ validate_textures(struct NineDevice9 *device)
     }
 }
 
-static INLINE boolean
+static inline boolean
 update_sampler_derived(struct nine_state *state, unsigned s)
 {
     boolean changed = FALSE;
diff --git a/src/gallium/state_trackers/nine/nineexoverlayextension.h b/src/gallium/state_trackers/nine/nineexoverlayextension.h
index a16d690dc8c..1616ed0532c 100644
--- a/src/gallium/state_trackers/nine/nineexoverlayextension.h
+++ b/src/gallium/state_trackers/nine/nineexoverlayextension.h
@@ -29,7 +29,7 @@ struct Nine9ExOverlayExtension
 {
     struct NineUnknown base;
 };
-static INLINE struct Nine9ExOverlayExtension *
+static inline struct Nine9ExOverlayExtension *
 Nine9ExOverlayExtension( void *data )
 {
     return (struct Nine9ExOverlayExtension *)data;
diff --git a/src/gallium/state_trackers/nine/pixelshader9.h b/src/gallium/state_trackers/nine/pixelshader9.h
index 5e2219c946a..6dad1d1ee76 100644
--- a/src/gallium/state_trackers/nine/pixelshader9.h
+++ b/src/gallium/state_trackers/nine/pixelshader9.h
@@ -47,7 +47,7 @@ struct NinePixelShader9
 
     uint64_t ff_key[6];
 };
-static INLINE struct NinePixelShader9 *
+static inline struct NinePixelShader9 *
 NinePixelShader9( void *data )
 {
     return (struct NinePixelShader9 *)data;
diff --git a/src/gallium/state_trackers/nine/query9.c b/src/gallium/state_trackers/nine/query9.c
index 04f4aadabba..3afa9007f61 100644
--- a/src/gallium/state_trackers/nine/query9.c
+++ b/src/gallium/state_trackers/nine/query9.c
@@ -57,7 +57,7 @@ d3dquerytype_to_pipe_query(struct pipe_screen *screen, D3DQUERYTYPE type)
 
 #define GET_DATA_SIZE_CASE2(a, b) case D3DQUERYTYPE_##a: return sizeof(D3DDEVINFO_##b)
 #define GET_DATA_SIZE_CASET(a, b) case D3DQUERYTYPE_##a: return sizeof(b)
-static INLINE DWORD
+static inline DWORD
 nine_query_result_size(D3DQUERYTYPE type)
 {
     switch (type) {
diff --git a/src/gallium/state_trackers/nine/query9.h b/src/gallium/state_trackers/nine/query9.h
index ad1ca50f26d..9cc1e317055 100644
--- a/src/gallium/state_trackers/nine/query9.h
+++ b/src/gallium/state_trackers/nine/query9.h
@@ -41,7 +41,7 @@ struct NineQuery9
     enum nine_query_state state;
     boolean instant; /* true if D3DISSUE_BEGIN is not needed / invalid */
 };
-static INLINE struct NineQuery9 *
+static inline struct NineQuery9 *
 NineQuery9( void *data )
 {
     return (struct NineQuery9 *)data;
diff --git a/src/gallium/state_trackers/nine/resource9.h b/src/gallium/state_trackers/nine/resource9.h
index da1dd6320e0..906f90806ce 100644
--- a/src/gallium/state_trackers/nine/resource9.h
+++ b/src/gallium/state_trackers/nine/resource9.h
@@ -46,7 +46,7 @@ struct NineResource9
     /* for [GS]etPrivateData/FreePrivateData */
     struct util_hash_table *pdata;
 };
-static INLINE struct NineResource9 *
+static inline struct NineResource9 *
 NineResource9( void *data )
 {
     return (struct NineResource9 *)data;
diff --git a/src/gallium/state_trackers/nine/stateblock9.h b/src/gallium/state_trackers/nine/stateblock9.h
index bcaf634d933..a580c6a2302 100644
--- a/src/gallium/state_trackers/nine/stateblock9.h
+++ b/src/gallium/state_trackers/nine/stateblock9.h
@@ -43,7 +43,7 @@ struct NineStateBlock9
 
     enum nine_stateblock_type type;
 };
-static INLINE struct NineStateBlock9 *
+static inline struct NineStateBlock9 *
 NineStateBlock9( void *data )
 {
     return (struct NineStateBlock9 *)data;
diff --git a/src/gallium/state_trackers/nine/surface9.c b/src/gallium/state_trackers/nine/surface9.c
index e46afd91157..7533cb3a454 100644
--- a/src/gallium/state_trackers/nine/surface9.c
+++ b/src/gallium/state_trackers/nine/surface9.c
@@ -261,7 +261,7 @@ NineSurface9_GetDesc( struct NineSurface9 *This,
 }
 
 /* Add the dirty rects to the source texture */
-INLINE void
+inline void
 NineSurface9_AddDirtyRect( struct NineSurface9 *This,
                            const struct pipe_box *box )
 {
@@ -295,7 +295,7 @@ NineSurface9_AddDirtyRect( struct NineSurface9 *This,
     }
 }
 
-static INLINE uint8_t *
+static inline uint8_t *
 NineSurface9_GetSystemMemPointer(struct NineSurface9 *This, int x, int y)
 {
     unsigned x_offset = util_format_get_stride(This->base.info.format, x);
diff --git a/src/gallium/state_trackers/nine/surface9.h b/src/gallium/state_trackers/nine/surface9.h
index 2e409558609..73092ab8cf5 100644
--- a/src/gallium/state_trackers/nine/surface9.h
+++ b/src/gallium/state_trackers/nine/surface9.h
@@ -50,7 +50,7 @@ struct NineSurface9
     uint8_t *data; /* system memory backing */
     unsigned stride; /* for system memory backing */
 };
-static INLINE struct NineSurface9 *
+static inline struct NineSurface9 *
 NineSurface9( void *data )
 {
     return (struct NineSurface9 *)data;
@@ -89,7 +89,7 @@ NineSurface9_MarkContainerDirty( struct NineSurface9 *This );
 struct pipe_surface *
 NineSurface9_CreatePipeSurface( struct NineSurface9 *This, const int sRGB );
 
-static INLINE struct pipe_surface *
+static inline struct pipe_surface *
 NineSurface9_GetSurface( struct NineSurface9 *This, int sRGB )
 {
     if (This->surface[sRGB])
@@ -97,13 +97,13 @@ NineSurface9_GetSurface( struct NineSurface9 *This, int sRGB )
     return NineSurface9_CreatePipeSurface(This, sRGB);
 }
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 NineSurface9_GetResource( struct NineSurface9 *This )
 {
     return This->base.resource;
 }
 
-static INLINE void
+static inline void
 NineSurface9_SetResource( struct NineSurface9 *This,
                           struct pipe_resource *resource, unsigned level )
 {
@@ -131,7 +131,7 @@ NineSurface9_CopySurface( struct NineSurface9 *This,
                           const POINT *pDestPoint,
                           const RECT *pSourceRect );
 
-static INLINE boolean
+static inline boolean
 NineSurface9_IsOffscreenPlain (struct NineSurface9 *This )
 {
     return This->base.usage == 0 && !This->texture;
@@ -141,7 +141,7 @@ NineSurface9_IsOffscreenPlain (struct NineSurface9 *This )
 void
 NineSurface9_Dump( struct NineSurface9 *This );
 #else
-static INLINE void
+static inline void
 NineSurface9_Dump( struct NineSurface9 *This ) { }
 #endif
 
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index c40bc602460..a62e6ad99d8 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -631,7 +631,7 @@ static void pend_present(struct NineSwapChain9 *This,
     return;
 }
 
-static INLINE HRESULT
+static inline HRESULT
 present( struct NineSwapChain9 *This,
          const RECT *pSourceRect,
          const RECT *pDestRect,
@@ -726,7 +726,7 @@ bypass_rendering:
         BOOL still_draw = FALSE;
         fence = swap_fences_see_front(This);
         if (fence) {
-            still_draw = !This->screen->fence_signalled(This->screen, fence);
+            still_draw = !This->screen->fence_finish(This->screen, fence, 0);
             This->screen->fence_reference(This->screen, &fence, NULL);
         }
         if (still_draw)
diff --git a/src/gallium/state_trackers/nine/swapchain9.h b/src/gallium/state_trackers/nine/swapchain9.h
index 2afd6ab2954..5e48dde5004 100644
--- a/src/gallium/state_trackers/nine/swapchain9.h
+++ b/src/gallium/state_trackers/nine/swapchain9.h
@@ -76,7 +76,7 @@ struct NineSwapChain9
     BOOL enable_threadpool;
 };
 
-static INLINE struct NineSwapChain9 *
+static inline struct NineSwapChain9 *
 NineSwapChain9( void *data )
 {
     return (struct NineSwapChain9 *)data;
diff --git a/src/gallium/state_trackers/nine/swapchain9ex.h b/src/gallium/state_trackers/nine/swapchain9ex.h
index bf407836099..075f8835222 100644
--- a/src/gallium/state_trackers/nine/swapchain9ex.h
+++ b/src/gallium/state_trackers/nine/swapchain9ex.h
@@ -29,7 +29,7 @@ struct NineSwapChain9Ex
 {
     struct NineSwapChain9 base;
 };
-static INLINE struct NineSwapChain9Ex *
+static inline struct NineSwapChain9Ex *
 NineSwapChain9Ex( void *data )
 {
     return (struct NineSwapChain9Ex *)data;
diff --git a/src/gallium/state_trackers/nine/texture9.h b/src/gallium/state_trackers/nine/texture9.h
index 65db874b2a3..6f80be9ccde 100644
--- a/src/gallium/state_trackers/nine/texture9.h
+++ b/src/gallium/state_trackers/nine/texture9.h
@@ -33,7 +33,7 @@ struct NineTexture9
     struct pipe_box dirty_rect; /* covers all mip levels */
     uint8_t *managed_buffer;
 };
-static INLINE struct NineTexture9 *
+static inline struct NineTexture9 *
 NineTexture9( void *data )
 {
     return (struct NineTexture9 *)data;
diff --git a/src/gallium/state_trackers/nine/vertexbuffer9.h b/src/gallium/state_trackers/nine/vertexbuffer9.h
index 0d88b839cad..6174de4df08 100644
--- a/src/gallium/state_trackers/nine/vertexbuffer9.h
+++ b/src/gallium/state_trackers/nine/vertexbuffer9.h
@@ -40,7 +40,7 @@ struct NineVertexBuffer9
 
     D3DVERTEXBUFFER_DESC desc;
 };
-static INLINE struct NineVertexBuffer9 *
+static inline struct NineVertexBuffer9 *
 NineVertexBuffer9( void *data )
 {
     return (struct NineVertexBuffer9 *)data;
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.c b/src/gallium/state_trackers/nine/vertexdeclaration9.c
index 9e4cb55bc67..2047b91abc4 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.c
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.c
@@ -34,7 +34,7 @@
 
 #define DBG_CHANNEL DBG_VERTEXDECLARATION
 
-static INLINE enum pipe_format decltype_format(BYTE type)
+static inline enum pipe_format decltype_format(BYTE type)
 {
     switch (type) {
     case D3DDECLTYPE_FLOAT1:    return PIPE_FORMAT_R32_FLOAT;
@@ -60,7 +60,7 @@ static INLINE enum pipe_format decltype_format(BYTE type)
     return PIPE_FORMAT_NONE;
 }
 
-static INLINE unsigned decltype_size(BYTE type)
+static inline unsigned decltype_size(BYTE type)
 {
     switch (type) {
     case D3DDECLTYPE_FLOAT1: return 1 * sizeof(float);
@@ -90,7 +90,7 @@ static INLINE unsigned decltype_size(BYTE type)
  * simple lookup table won't work in that case. Let's just wait
  * with making this more generic until we need it.
  */
-static INLINE boolean
+static inline boolean
 nine_d3ddeclusage_check(unsigned usage, unsigned usage_idx)
 {
     switch (usage) {
@@ -162,7 +162,7 @@ static const char *nine_declusage_names[] =
     [NINE_DECLUSAGE_FOG]             = "FOG",
     [NINE_DECLUSAGE_NONE]            = "(NONE)",
 };
-static INLINE const char *
+static inline const char *
 nine_declusage_name(unsigned ndcl)
 {
     return nine_declusage_names[ndcl % NINE_DECLUSAGE_COUNT];
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.h b/src/gallium/state_trackers/nine/vertexdeclaration9.h
index a4d4a0445d5..655bcfbf165 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.h
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.h
@@ -47,7 +47,7 @@ struct NineVertexDeclaration9
     D3DVERTEXELEMENT9 *decls;
     DWORD fvf;
 };
-static INLINE struct NineVertexDeclaration9 *
+static inline struct NineVertexDeclaration9 *
 NineVertexDeclaration9( void *data )
 {
     return (struct NineVertexDeclaration9 *)data;
diff --git a/src/gallium/state_trackers/nine/vertexshader9.h b/src/gallium/state_trackers/nine/vertexshader9.h
index 3495c9f9c55..66c602c7b3c 100644
--- a/src/gallium/state_trackers/nine/vertexshader9.h
+++ b/src/gallium/state_trackers/nine/vertexshader9.h
@@ -56,7 +56,7 @@ struct NineVertexShader9
 
     uint64_t ff_key[2];
 };
-static INLINE struct NineVertexShader9 *
+static inline struct NineVertexShader9 *
 NineVertexShader9( void *data )
 {
     return (struct NineVertexShader9 *)data;
diff --git a/src/gallium/state_trackers/nine/volume9.c b/src/gallium/state_trackers/nine/volume9.c
index b34ee07dce9..4dfc5599a8e 100644
--- a/src/gallium/state_trackers/nine/volume9.c
+++ b/src/gallium/state_trackers/nine/volume9.c
@@ -152,7 +152,7 @@ NineVolume9_GetContainer( struct NineVolume9 *This,
     return NineUnknown_QueryInterface(NineUnknown(This)->container, riid, ppContainer);
 }
 
-static INLINE void
+static inline void
 NineVolume9_MarkContainerDirty( struct NineVolume9 *This )
 {
     struct NineBaseTexture9 *tex;
@@ -182,13 +182,13 @@ NineVolume9_GetDesc( struct NineVolume9 *This,
     return D3D_OK;
 }
 
-static INLINE boolean
+static inline boolean
 NineVolume9_IsDirty(struct NineVolume9 *This)
 {
     return This->dirty_box[0].width != 0;
 }
 
-INLINE void
+inline void
 NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
                             const struct pipe_box *box )
 {
@@ -226,7 +226,7 @@ NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
     }
 }
 
-static INLINE uint8_t *
+static inline uint8_t *
 NineVolume9_GetSystemMemPointer(struct NineVolume9 *This, int x, int y, int z)
 {
     unsigned x_offset = util_format_get_stride(This->info.format, x);
diff --git a/src/gallium/state_trackers/nine/volume9.h b/src/gallium/state_trackers/nine/volume9.h
index 802836659c2..fae24310a50 100644
--- a/src/gallium/state_trackers/nine/volume9.h
+++ b/src/gallium/state_trackers/nine/volume9.h
@@ -57,7 +57,7 @@ struct NineVolume9
     /* for [GS]etPrivateData/FreePrivateData */
     struct util_hash_table *pdata;
 };
-static INLINE struct NineVolume9 *
+static inline struct NineVolume9 *
 NineVolume9( void *data )
 {
     return (struct NineVolume9 *)data;
@@ -73,7 +73,7 @@ NineVolume9_new( struct NineDevice9 *pDevice,
 
 /*** Nine private ***/
 
-static INLINE void
+static inline void
 NineVolume9_SetResource( struct NineVolume9 *This,
                          struct pipe_resource *resource, unsigned level )
 {
@@ -85,7 +85,7 @@ void
 NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
                             const struct pipe_box *box );
 
-static INLINE void
+static inline void
 NineVolume9_ClearDirtyRegion( struct NineVolume9 *This )
 {
     memset(&This->dirty_box, 0, sizeof(This->dirty_box));
diff --git a/src/gallium/state_trackers/nine/volumetexture9.h b/src/gallium/state_trackers/nine/volumetexture9.h
index 313fa1a91fb..b8f250ad72e 100644
--- a/src/gallium/state_trackers/nine/volumetexture9.h
+++ b/src/gallium/state_trackers/nine/volumetexture9.h
@@ -32,7 +32,7 @@ struct NineVolumeTexture9
     struct NineVolume9 **volumes;
     struct pipe_box dirty_box;
 };
-static INLINE struct NineVolumeTexture9 *
+static inline struct NineVolumeTexture9 *
 NineVolumeTexture9( void *data )
 {
     return (struct NineVolumeTexture9 *)data;
diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c
index ae1a98f5be3..2bd0194189f 100644
--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -180,6 +180,11 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
                                 PIPE_VIDEO_ENTRYPOINT_ENCODE, PIPE_VIDEO_CAP_SUPPORTED))
       return OMX_ErrorBadParameter;
  
+   priv->stacked_frames_num = screen->get_video_param(screen,
+                                PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
+                                PIPE_VIDEO_ENTRYPOINT_ENCODE,
+                                PIPE_VIDEO_CAP_STACKED_FRAMES);
+
    priv->s_pipe = screen->context_create(screen, priv->screen);
    if (!priv->s_pipe)
       return OMX_ErrorInsufficientResources;
@@ -259,6 +264,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
    LIST_INITHEAD(&priv->free_tasks);
    LIST_INITHEAD(&priv->used_tasks);
    LIST_INITHEAD(&priv->b_frames);
+   LIST_INITHEAD(&priv->stacked_tasks);
 
    return OMX_ErrorNone;
 }
@@ -271,6 +277,7 @@ static OMX_ERRORTYPE vid_enc_Destructor(OMX_COMPONENTTYPE *comp)
    enc_ReleaseTasks(&priv->free_tasks);
    enc_ReleaseTasks(&priv->used_tasks);
    enc_ReleaseTasks(&priv->b_frames);
+   enc_ReleaseTasks(&priv->stacked_tasks);
 
    if (priv->ports) {
       for (i = 0; i < priv->sPortTypesParam[OMX_PortDomainVideo].nPorts; ++i) {
@@ -1116,6 +1123,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
    struct input_buf_private *inp = buf->pInputPortPrivate;
    enum pipe_h264_enc_picture_type picture_type;
    struct encode_task *task;
+   unsigned stacked_num = 0;
    OMX_ERRORTYPE err;
 
    enc_MoveTasks(&inp->tasks, &priv->free_tasks);
@@ -1127,6 +1135,8 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
       if (buf->nFlags & OMX_BUFFERFLAG_EOS) {
          buf->nFilledLen = buf->nAllocLen;
          enc_ClearBframes(port, inp);
+         enc_MoveTasks(&priv->stacked_tasks, &inp->tasks);
+         priv->codec->flush(priv->codec);
       }
       return base_port_SendBufferFunction(port, buf);
    }
@@ -1166,7 +1176,16 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
       /* handle I or P frame */
       priv->ref_idx_l0 = priv->ref_idx_l1;
       enc_HandleTask(port, task, picture_type);
-      LIST_ADDTAIL(&task->list, &inp->tasks);
+      LIST_ADDTAIL(&task->list, &priv->stacked_tasks);
+      LIST_FOR_EACH_ENTRY(task, &priv->stacked_tasks, list) {
+         ++stacked_num;
+      }
+      if (stacked_num == priv->stacked_frames_num) {
+         struct encode_task *t;
+         t = LIST_ENTRY(struct encode_task, priv->stacked_tasks.next, list);
+         LIST_DEL(&t->list);
+         LIST_ADDTAIL(&t->list, &inp->tasks);
+      }
       priv->ref_idx_l1 = priv->frame_num++;
 
       /* handle B frames */
diff --git a/src/gallium/state_trackers/omx/vid_enc.h b/src/gallium/state_trackers/omx/vid_enc.h
index c8d192b9c60..a83374450b5 100644
--- a/src/gallium/state_trackers/omx/vid_enc.h
+++ b/src/gallium/state_trackers/omx/vid_enc.h
@@ -73,6 +73,7 @@ DERIVEDCLASS(vid_enc_PrivateType, omx_base_filter_PrivateType)
 	struct list_head free_tasks; \
 	struct list_head used_tasks; \
 	struct list_head b_frames; \
+	struct list_head stacked_tasks; \
 	OMX_U32 frame_rate; \
 	OMX_U32 frame_num; \
 	OMX_U32 pic_order_cnt; \
@@ -86,7 +87,8 @@ DERIVEDCLASS(vid_enc_PrivateType, omx_base_filter_PrivateType)
 	struct vl_compositor_state cstate; \
 	struct pipe_video_buffer *scale_buffer[OMX_VID_ENC_NUM_SCALING_BUFFERS]; \
 	OMX_CONFIG_SCALEFACTORTYPE scale; \
-	OMX_U32 current_scale_buffer;
+	OMX_U32 current_scale_buffer; \
+	OMX_U32 stacked_frames_num;
 ENDCLASS(vid_enc_PrivateType)
 
 OMX_ERRORTYPE vid_enc_LoaderComponent(stLoaderComponentType *comp);
diff --git a/src/gallium/state_trackers/osmesa/osmesa.c b/src/gallium/state_trackers/osmesa/osmesa.c
index 2d5d096d8ed..0285cb0dac2 100644
--- a/src/gallium/state_trackers/osmesa/osmesa.c
+++ b/src/gallium/state_trackers/osmesa/osmesa.c
@@ -168,7 +168,7 @@ get_st_manager(void)
 }
 
 
-static INLINE boolean
+static inline boolean
 little_endian(void)
 {
    const unsigned ui = 1;
@@ -292,7 +292,7 @@ osmesa_init_st_visual(struct st_visual *vis,
 /**
  * Return the osmesa_buffer that corresponds to an st_framebuffer_iface.
  */
-static INLINE struct osmesa_buffer *
+static inline struct osmesa_buffer *
 stfbi_to_osbuffer(struct st_framebuffer_iface *stfbi)
 {
    return (struct osmesa_buffer *) stfbi->st_manager_private;
@@ -886,7 +886,7 @@ static struct name_function functions[] = {
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
-   { "OSMesaPixelsStore", (OSMESAproc) OSMesaPixelStore },
+   { "OSMesaPixelStore", (OSMESAproc) OSMesaPixelStore },
    { "OSMesaGetIntegerv", (OSMESAproc) OSMesaGetIntegerv },
    { "OSMesaGetDepthBuffer", (OSMESAproc) OSMesaGetDepthBuffer },
    { "OSMesaGetColorBuffer", (OSMESAproc) OSMesaGetColorBuffer },
diff --git a/src/gallium/state_trackers/vdpau/decode.c b/src/gallium/state_trackers/vdpau/decode.c
index 0634ba72fda..3233799d650 100644
--- a/src/gallium/state_trackers/vdpau/decode.c
+++ b/src/gallium/state_trackers/vdpau/decode.c
@@ -413,6 +413,115 @@ vlVdpDecoderRenderH264(struct pipe_h264_picture_desc *picture,
    return VDP_STATUS_OK;
 }
 
+static VdpStatus /* VDP_STATUS_OK, or the first non-OK status returned by vlVdpGetReferenceFrame */
+vlVdpDecoderRenderH265(struct pipe_h265_picture_desc *picture, /* out: filled in; picture->pps and picture->pps->sps must already point at storage */
+                       VdpPictureInfoHEVC *picture_info) /* in: only read here */
+{
+   unsigned i;
+
+   picture->pps->sps->chroma_format_idc = picture_info->chroma_format_idc; /* SPS block: same-named fields copied one-to-one into picture->pps->sps */
+   picture->pps->sps->separate_colour_plane_flag = picture_info->separate_colour_plane_flag;
+   picture->pps->sps->pic_width_in_luma_samples = picture_info->pic_width_in_luma_samples;
+   picture->pps->sps->pic_height_in_luma_samples = picture_info->pic_height_in_luma_samples;
+   picture->pps->sps->bit_depth_luma_minus8 = picture_info->bit_depth_luma_minus8;
+   picture->pps->sps->bit_depth_chroma_minus8 = picture_info->bit_depth_chroma_minus8;
+   picture->pps->sps->log2_max_pic_order_cnt_lsb_minus4 = picture_info->log2_max_pic_order_cnt_lsb_minus4;
+   picture->pps->sps->sps_max_dec_pic_buffering_minus1 = picture_info->sps_max_dec_pic_buffering_minus1;
+   picture->pps->sps->log2_min_luma_coding_block_size_minus3 = picture_info->log2_min_luma_coding_block_size_minus3;
+   picture->pps->sps->log2_diff_max_min_luma_coding_block_size = picture_info->log2_diff_max_min_luma_coding_block_size;
+   picture->pps->sps->log2_min_transform_block_size_minus2 = picture_info->log2_min_transform_block_size_minus2;
+   picture->pps->sps->log2_diff_max_min_transform_block_size = picture_info->log2_diff_max_min_transform_block_size;
+   picture->pps->sps->max_transform_hierarchy_depth_inter = picture_info->max_transform_hierarchy_depth_inter;
+   picture->pps->sps->max_transform_hierarchy_depth_intra = picture_info->max_transform_hierarchy_depth_intra;
+   picture->pps->sps->scaling_list_enabled_flag = picture_info->scaling_list_enabled_flag;
+   memcpy(picture->pps->sps->ScalingList4x4, picture_info->ScalingList4x4, 6*16); /* scaling lists: byte counts are hard-coded; presumably both sides are byte arrays of these sizes - TODO confirm against the struct definitions */
+   memcpy(picture->pps->sps->ScalingList8x8, picture_info->ScalingList8x8, 6*64);
+   memcpy(picture->pps->sps->ScalingList16x16, picture_info->ScalingList16x16, 6*64);
+   memcpy(picture->pps->sps->ScalingList32x32, picture_info->ScalingList32x32, 2*64);
+   memcpy(picture->pps->sps->ScalingListDCCoeff16x16, picture_info->ScalingListDCCoeff16x16, 6);
+   memcpy(picture->pps->sps->ScalingListDCCoeff32x32, picture_info->ScalingListDCCoeff32x32, 2);
+   picture->pps->sps->amp_enabled_flag = picture_info->amp_enabled_flag;
+   picture->pps->sps->sample_adaptive_offset_enabled_flag = picture_info->sample_adaptive_offset_enabled_flag;
+   picture->pps->sps->pcm_enabled_flag = picture_info->pcm_enabled_flag;
+   picture->pps->sps->pcm_sample_bit_depth_luma_minus1 = picture_info->pcm_sample_bit_depth_luma_minus1;
+   picture->pps->sps->pcm_sample_bit_depth_chroma_minus1 = picture_info->pcm_sample_bit_depth_chroma_minus1;
+   picture->pps->sps->log2_min_pcm_luma_coding_block_size_minus3 = picture_info->log2_min_pcm_luma_coding_block_size_minus3;
+   picture->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size = picture_info->log2_diff_max_min_pcm_luma_coding_block_size;
+   picture->pps->sps->pcm_loop_filter_disabled_flag = picture_info->pcm_loop_filter_disabled_flag;
+   picture->pps->sps->num_short_term_ref_pic_sets = picture_info->num_short_term_ref_pic_sets;
+   picture->pps->sps->long_term_ref_pics_present_flag = picture_info->long_term_ref_pics_present_flag;
+   picture->pps->sps->num_long_term_ref_pics_sps = picture_info->num_long_term_ref_pics_sps;
+   picture->pps->sps->sps_temporal_mvp_enabled_flag = picture_info->sps_temporal_mvp_enabled_flag;
+   picture->pps->sps->strong_intra_smoothing_enabled_flag = picture_info->strong_intra_smoothing_enabled_flag;
+
+   picture->pps->dependent_slice_segments_enabled_flag = picture_info->dependent_slice_segments_enabled_flag; /* PPS block: same-named fields copied one-to-one into picture->pps */
+   picture->pps->output_flag_present_flag = picture_info->output_flag_present_flag;
+   picture->pps->num_extra_slice_header_bits = picture_info->num_extra_slice_header_bits;
+   picture->pps->sign_data_hiding_enabled_flag = picture_info->sign_data_hiding_enabled_flag;
+   picture->pps->cabac_init_present_flag = picture_info->cabac_init_present_flag;
+   picture->pps->num_ref_idx_l0_default_active_minus1 = picture_info->num_ref_idx_l0_default_active_minus1;
+   picture->pps->num_ref_idx_l1_default_active_minus1 = picture_info->num_ref_idx_l1_default_active_minus1;
+   picture->pps->init_qp_minus26 = picture_info->init_qp_minus26;
+   picture->pps->constrained_intra_pred_flag = picture_info->constrained_intra_pred_flag;
+   picture->pps->transform_skip_enabled_flag = picture_info->transform_skip_enabled_flag;
+   picture->pps->cu_qp_delta_enabled_flag = picture_info->cu_qp_delta_enabled_flag;
+   picture->pps->diff_cu_qp_delta_depth = picture_info->diff_cu_qp_delta_depth;
+   picture->pps->pps_cb_qp_offset = picture_info->pps_cb_qp_offset;
+   picture->pps->pps_cr_qp_offset = picture_info->pps_cr_qp_offset;
+   picture->pps->pps_slice_chroma_qp_offsets_present_flag = picture_info->pps_slice_chroma_qp_offsets_present_flag;
+   picture->pps->weighted_pred_flag = picture_info->weighted_pred_flag;
+   picture->pps->weighted_bipred_flag = picture_info->weighted_bipred_flag;
+   picture->pps->transquant_bypass_enabled_flag = picture_info->transquant_bypass_enabled_flag;
+   picture->pps->tiles_enabled_flag = picture_info->tiles_enabled_flag;
+   picture->pps->entropy_coding_sync_enabled_flag = picture_info->entropy_coding_sync_enabled_flag;
+   picture->pps->num_tile_columns_minus1 = picture_info->num_tile_columns_minus1;
+   picture->pps->num_tile_rows_minus1 = picture_info->num_tile_rows_minus1;
+   picture->pps->uniform_spacing_flag = picture_info->uniform_spacing_flag;
+   memcpy(picture->pps->column_width_minus1, picture_info->column_width_minus1, 20 * 2); /* 40 bytes; presumably 20 two-byte entries - TODO confirm element type */
+   memcpy(picture->pps->row_height_minus1, picture_info->row_height_minus1, 22 * 2); /* 44 bytes; presumably 22 two-byte entries - TODO confirm element type */
+   picture->pps->loop_filter_across_tiles_enabled_flag = picture_info->loop_filter_across_tiles_enabled_flag;
+   picture->pps->pps_loop_filter_across_slices_enabled_flag = picture_info->pps_loop_filter_across_slices_enabled_flag;
+   picture->pps->deblocking_filter_control_present_flag = picture_info->deblocking_filter_control_present_flag;
+   picture->pps->deblocking_filter_override_enabled_flag = picture_info->deblocking_filter_override_enabled_flag;
+   picture->pps->pps_deblocking_filter_disabled_flag = picture_info->pps_deblocking_filter_disabled_flag;
+   picture->pps->pps_beta_offset_div2 = picture_info->pps_beta_offset_div2;
+   picture->pps->pps_tc_offset_div2 = picture_info->pps_tc_offset_div2;
+   picture->pps->lists_modification_present_flag = picture_info->lists_modification_present_flag;
+   picture->pps->log2_parallel_merge_level_minus2 = picture_info->log2_parallel_merge_level_minus2;
+   picture->pps->slice_segment_header_extension_present_flag = picture_info->slice_segment_header_extension_present_flag;
+
+   picture->IDRPicFlag = picture_info->IDRPicFlag; /* remaining fields are stored directly on the picture descriptor */
+   picture->RAPPicFlag = picture_info->RAPPicFlag;
+   picture->CurrRpsIdx = picture_info->CurrRpsIdx;
+   picture->NumPocTotalCurr = picture_info->NumPocTotalCurr;
+   picture->NumDeltaPocsOfRefRpsIdx = picture_info->NumDeltaPocsOfRefRpsIdx;
+   picture->NumShortTermPictureSliceHeaderBits = picture_info->NumShortTermPictureSliceHeaderBits;
+   picture->NumLongTermPictureSliceHeaderBits = picture_info->NumLongTermPictureSliceHeaderBits;
+   picture->CurrPicOrderCntVal = picture_info->CurrPicOrderCntVal;
+
+   for (i = 0; i < 16; ++i) { /* all 16 RefPics slots are processed unconditionally */
+      VdpStatus ret = vlVdpGetReferenceFrame
+      (
+         picture_info->RefPics[i],
+         &picture->ref[i]
+      );
+      if (ret != VDP_STATUS_OK) /* bail out on the first failed lookup; fields written above are left as they are */
+         return ret;
+
+      picture->PicOrderCntVal[i] = picture_info->PicOrderCntVal[i];
+      picture->IsLongTerm[i] = picture_info->IsLongTerm[i];
+   }
+
+   picture->NumPocStCurrBefore = picture_info->NumPocStCurrBefore;
+   picture->NumPocStCurrAfter = picture_info->NumPocStCurrAfter;
+   picture->NumPocLtCurr = picture_info->NumPocLtCurr;
+   memcpy(picture->RefPicSetStCurrBefore, picture_info->RefPicSetStCurrBefore, 8); /* fixed 8-byte copies; presumably 8 one-byte entries per set - TODO confirm */
+   memcpy(picture->RefPicSetStCurrAfter, picture_info->RefPicSetStCurrAfter, 8);
+   memcpy(picture->RefPicSetLtCurr, picture_info->RefPicSetLtCurr, 8);
+
+   return VDP_STATUS_OK;
+}
+
 static void
 vlVdpDecoderFixVC1Startcode(uint32_t *num_buffers, const void *buffers[], unsigned sizes[])
 {
@@ -461,14 +570,17 @@ vlVdpDecoderRender(VdpDecoder decoder,
    struct pipe_video_codec *dec;
    bool buffer_support[2];
    unsigned i;
-   struct pipe_h264_sps sps = {};
-   struct pipe_h264_pps pps = { &sps };
+   struct pipe_h264_sps sps_h264 = {};
+   struct pipe_h264_pps pps_h264 = { &sps_h264 };
+   struct pipe_h265_sps sps_h265 = {};
+   struct pipe_h265_pps pps_h265 = { &sps_h265 };
    union {
       struct pipe_picture_desc base;
       struct pipe_mpeg12_picture_desc mpeg12;
       struct pipe_mpeg4_picture_desc mpeg4;
       struct pipe_vc1_picture_desc vc1;
       struct pipe_h264_picture_desc h264;
+      struct pipe_h265_picture_desc h265;
    } desc;
 
    if (!(picture_info && bitstream_buffers))
@@ -547,9 +659,13 @@ vlVdpDecoderRender(VdpDecoder decoder,
       ret = vlVdpDecoderRenderVC1(&desc.vc1, (VdpPictureInfoVC1 *)picture_info);
       break;
    case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-      desc.h264.pps = &pps;
+      desc.h264.pps = &pps_h264;
       ret = vlVdpDecoderRenderH264(&desc.h264, (VdpPictureInfoH264 *)picture_info);
       break;
+   case PIPE_VIDEO_FORMAT_HEVC:
+      desc.h265.pps = &pps_h265;
+      ret = vlVdpDecoderRenderH265(&desc.h265, (VdpPictureInfoHEVC *)picture_info);
+      break;
    default:
       return VDP_STATUS_INVALID_DECODER_PROFILE;
    }
diff --git a/src/gallium/state_trackers/vdpau/mixer.c b/src/gallium/state_trackers/vdpau/mixer.c
index 4118eb86997..c0b1ecc55fa 100644
--- a/src/gallium/state_trackers/vdpau/mixer.c
+++ b/src/gallium/state_trackers/vdpau/mixer.c
@@ -49,7 +49,8 @@ vlVdpVideoMixerCreate(VdpDevice device,
    vlVdpVideoMixer *vmixer = NULL;
    VdpStatus ret;
    struct pipe_screen *screen;
-   unsigned max_width, max_height, i;
+   uint32_t max_2d_texture_level;
+   unsigned max_size, i;
 
    vlVdpDevice *dev = vlGetDataHTAB(device);
    if (!dev)
@@ -134,18 +135,17 @@ vlVdpVideoMixerCreate(VdpDevice device,
       VDPAU_MSG(VDPAU_WARN, "[VDPAU] Max layers > 4 not supported\n", vmixer->max_layers);
       goto no_params;
    }
-   max_width = screen->get_video_param(screen, PIPE_VIDEO_PROFILE_UNKNOWN,
-                                       PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_MAX_WIDTH);
-   max_height = screen->get_video_param(screen, PIPE_VIDEO_PROFILE_UNKNOWN,
-                                        PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_MAX_HEIGHT);
-   if (vmixer->video_width < 48 ||
-       vmixer->video_width > max_width) {
-      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u not valid for width\n", vmixer->video_width, max_width);
+
+   max_2d_texture_level = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   max_size = max_2d_texture_level ? 1u << (max_2d_texture_level - 1) : 0; /* 2^(levels-1), integer-only; 0 levels => reject all sizes */
+   if (vmixer->video_width < 48 || vmixer->video_width > max_size) {
+      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u not valid for width\n",
+                vmixer->video_width, max_size);
       goto no_params;
    }
-   if (vmixer->video_height < 48 ||
-       vmixer->video_height > max_height) {
-      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u  not valid for height\n", vmixer->video_height, max_height);
+   if (vmixer->video_height < 48 || vmixer->video_height > max_size) {
+      VDPAU_MSG(VDPAU_WARN, "[VDPAU] 48 < %u < %u  not valid for height\n",
+                vmixer->video_height, max_size);
       goto no_params;
    }
    vmixer->luma_key_min = 0.f;
diff --git a/src/gallium/state_trackers/vdpau/presentation.c b/src/gallium/state_trackers/vdpau/presentation.c
index 7f8dbed7ee2..e53303708b2 100644
--- a/src/gallium/state_trackers/vdpau/presentation.c
+++ b/src/gallium/state_trackers/vdpau/presentation.c
@@ -369,7 +369,7 @@ vlVdpPresentationQueueQuerySurfaceStatus(VdpPresentationQueue presentation_queue
    } else {
       pipe_mutex_lock(pq->device->mutex);
       screen = pq->device->vscreen->pscreen;
-      if (screen->fence_signalled(screen, surf->fence)) {
+      if (screen->fence_finish(screen, surf->fence, 0)) {
          screen->fence_reference(screen, &surf->fence, NULL);
          *status = VDP_PRESENTATION_QUEUE_STATUS_VISIBLE;
          pipe_mutex_unlock(pq->device->mutex);
diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h
index e14ce041947..27ac44cd9c1 100644
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -261,6 +261,16 @@ ProfileToPipe(VdpDecoderProfile vdpau_profile)
          return PIPE_VIDEO_PROFILE_VC1_MAIN;
       case VDP_DECODER_PROFILE_VC1_ADVANCED:
          return PIPE_VIDEO_PROFILE_VC1_ADVANCED;
+      case VDP_DECODER_PROFILE_HEVC_MAIN:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_10:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_10;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_STILL:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_12:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_12;
+      case VDP_DECODER_PROFILE_HEVC_MAIN_444:
+         return PIPE_VIDEO_PROFILE_HEVC_MAIN_444;
       default:
          return PIPE_VIDEO_PROFILE_UNKNOWN;
    }
@@ -292,6 +302,16 @@ PipeToProfile(enum pipe_video_profile p_profile)
          return VDP_DECODER_PROFILE_VC1_MAIN;
       case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
          return VDP_DECODER_PROFILE_VC1_ADVANCED;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+         return VDP_DECODER_PROFILE_HEVC_MAIN;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_10;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_STILL;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_12:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_12;
+      case PIPE_VIDEO_PROFILE_HEVC_MAIN_444:
+         return VDP_DECODER_PROFILE_HEVC_MAIN_444;
       default:
          assert(0);
          return -1;
diff --git a/src/gallium/state_trackers/wgl/Makefile.sources b/src/gallium/state_trackers/wgl/Makefile.sources
index 8c463d5f18e..1e00caf97b7 100644
--- a/src/gallium/state_trackers/wgl/Makefile.sources
+++ b/src/gallium/state_trackers/wgl/Makefile.sources
@@ -9,6 +9,7 @@ C_SOURCES := \
 	stw_framebuffer.c \
 	stw_getprocaddress.c \
 	stw_nopfuncs.c \
+	stw_nopfuncs.h \
 	stw_pixelformat.c \
 	stw_st.c \
 	stw_tls.c \
diff --git a/src/gallium/state_trackers/wgl/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h
index 4a930b5bef8..e35a4b94036 100644
--- a/src/gallium/state_trackers/wgl/stw_device.h
+++ b/src/gallium/state_trackers/wgl/stw_device.h
@@ -80,7 +80,7 @@ struct stw_device
 extern struct stw_device *stw_dev;
 
 
-static INLINE struct stw_context *
+static inline struct stw_context *
 stw_lookup_context_locked( DHGLRC dhglrc )
 {
    if (dhglrc == 0 || stw_dev == NULL)
diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c
index 2b81b820495..7b34fcbb5ed 100644
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -45,7 +45,7 @@
  * Search the framebuffer with the matching HWND while holding the
  * stw_dev::fb_mutex global lock.
  */
-static INLINE struct stw_framebuffer *
+static inline struct stw_framebuffer *
 stw_framebuffer_from_hwnd_locked(
    HWND hwnd )
 {
@@ -376,7 +376,7 @@ stw_framebuffer_cleanup(void)
 /**
  * Given an hdc, return the corresponding stw_framebuffer.
  */
-static INLINE struct stw_framebuffer *
+static inline struct stw_framebuffer *
 stw_framebuffer_from_hdc_locked(
    HDC hdc )
 {
diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c
index 0a9116cbb73..b41171a9195 100644
--- a/src/gallium/state_trackers/wgl/stw_st.c
+++ b/src/gallium/state_trackers/wgl/stw_st.c
@@ -46,7 +46,7 @@ struct stw_st_framebuffer {
    unsigned texture_mask;
 };
 
-static INLINE struct stw_st_framebuffer *
+static inline struct stw_st_framebuffer *
 stw_st_framebuffer(struct st_framebuffer_iface *stfb)
 {
    return (struct stw_st_framebuffer *) stfb;
diff --git a/src/gallium/state_trackers/wgl/stw_tls.c b/src/gallium/state_trackers/wgl/stw_tls.c
index ca27a53433c..041066f5007 100644
--- a/src/gallium/state_trackers/wgl/stw_tls.c
+++ b/src/gallium/state_trackers/wgl/stw_tls.c
@@ -50,7 +50,7 @@ static CRITICAL_SECTION g_mutex = {
 static struct stw_tls_data *g_pendingTlsData = NULL;
 
 
-static INLINE struct stw_tls_data *
+static inline struct stw_tls_data *
 stw_tls_data_create(DWORD dwThreadId);
 
 static struct stw_tls_data *
@@ -111,7 +111,7 @@ stw_tls_init(void)
 /**
  * Install windows hook for a given thread (not necessarily the current one).
  */
-static INLINE struct stw_tls_data *
+static inline struct stw_tls_data *
 stw_tls_data_create(DWORD dwThreadId)
 {
    struct stw_tls_data *data;
diff --git a/src/gallium/state_trackers/xa/xa_composite.c b/src/gallium/state_trackers/xa/xa_composite.c
index c283a0d1892..7cfd1e136d1 100644
--- a/src/gallium/state_trackers/xa/xa_composite.c
+++ b/src/gallium/state_trackers/xa/xa_composite.c
@@ -167,7 +167,7 @@ blend_for_op(struct xa_composite_blend *blend,
 }
 
 
-static INLINE int
+static inline int
 xa_repeat_to_gallium(int mode)
 {
     switch(mode) {
@@ -185,7 +185,7 @@ xa_repeat_to_gallium(int mode)
     return PIPE_TEX_WRAP_REPEAT;
 }
 
-static INLINE boolean
+static inline boolean
 xa_filter_to_gallium(int xrender_filter, int *out_filter)
 {
 
diff --git a/src/gallium/state_trackers/xa/xa_context.c b/src/gallium/state_trackers/xa/xa_context.c
index fd49c82a559..ebfb290af13 100644
--- a/src/gallium/state_trackers/xa/xa_context.c
+++ b/src/gallium/state_trackers/xa/xa_context.c
@@ -37,7 +37,11 @@
 XA_EXPORT void
 xa_context_flush(struct xa_context *ctx)
 {
-	ctx->pipe->flush(ctx->pipe, &ctx->last_fence, 0);
+    if (ctx->last_fence) {
+        struct pipe_screen *screen = ctx->xa->screen;
+        screen->fence_reference(screen, &ctx->last_fence, NULL);
+    }
+    ctx->pipe->flush(ctx->pipe, &ctx->last_fence, 0);
 }
 
 XA_EXPORT struct xa_context *
diff --git a/src/gallium/state_trackers/xa/xa_priv.h b/src/gallium/state_trackers/xa/xa_priv.h
index f71c06c6c19..13a0e86f66d 100644
--- a/src/gallium/state_trackers/xa/xa_priv.h
+++ b/src/gallium/state_trackers/xa/xa_priv.h
@@ -123,7 +123,7 @@ struct xa_context {
     const struct xa_composite *comp;
 };
 
-static INLINE void
+static inline void
 xa_scissor_reset(struct xa_context *ctx)
 {
     ctx->scissor.maxx = 0;
@@ -133,7 +133,7 @@ xa_scissor_reset(struct xa_context *ctx)
     ctx->scissor_valid = FALSE;
 }
 
-static INLINE void
+static inline void
 xa_scissor_update(struct xa_context *ctx, unsigned minx, unsigned miny,
 		unsigned maxx, unsigned maxy)
 {
@@ -189,13 +189,13 @@ struct xa_shaders;
  * Inline utilities
  */
 
-static INLINE int
+static inline int
 xa_min(int a, int b)
 {
     return ((a <= b) ? a : b);
 }
 
-static INLINE void
+static inline void
 xa_pixel_to_float4(uint32_t pixel, float *color)
 {
     uint32_t	    r, g, b, a;
@@ -210,7 +210,7 @@ xa_pixel_to_float4(uint32_t pixel, float *color)
     color[3] = ((float)a) / 255.;
 }
 
-static INLINE void
+static inline void
 xa_pixel_to_float4_a8(uint32_t pixel, float *color)
 {
     uint32_t a;
diff --git a/src/gallium/state_trackers/xa/xa_renderer.c b/src/gallium/state_trackers/xa/xa_renderer.c
index 7b28afc907f..fda07e5b68e 100644
--- a/src/gallium/state_trackers/xa/xa_renderer.c
+++ b/src/gallium/state_trackers/xa/xa_renderer.c
@@ -45,14 +45,14 @@ void
 renderer_set_constants(struct xa_context *r,
 		       int shader_type, const float *params, int param_bytes);
 
-static INLINE boolean
+static inline boolean
 is_affine(float *matrix)
 {
     return floatIsZero(matrix[2]) && floatIsZero(matrix[5])
 	&& floatsEqual(matrix[8], 1);
 }
 
-static INLINE void
+static inline void
 map_point(float *mat, float x, float y, float *out_x, float *out_y)
 {
     if (!mat) {
@@ -71,7 +71,7 @@ map_point(float *mat, float x, float y, float *out_x, float *out_y)
     }
 }
 
-static INLINE void
+static inline void
 renderer_draw(struct xa_context *r)
 {
     int num_verts = r->buffer_size / (r->attrs_per_vertex * NUM_COMPONENTS);
@@ -97,7 +97,7 @@ renderer_draw(struct xa_context *r)
     xa_scissor_reset(r);
 }
 
-static INLINE void
+static inline void
 renderer_draw_conditional(struct xa_context *r, int next_batch)
 {
     if (r->buffer_size + next_batch >= XA_VB_SIZE ||
@@ -135,7 +135,7 @@ renderer_init_state(struct xa_context *r)
     }
 }
 
-static INLINE void
+static inline void
 add_vertex_color(struct xa_context *r, float x, float y, float color[4])
 {
     float *vertex = r->buffer + r->buffer_size;
@@ -153,7 +153,7 @@ add_vertex_color(struct xa_context *r, float x, float y, float color[4])
     r->buffer_size += 8;
 }
 
-static INLINE void
+static inline void
 add_vertex_1tex(struct xa_context *r, float x, float y, float s, float t)
 {
     float *vertex = r->buffer + r->buffer_size;
@@ -171,7 +171,7 @@ add_vertex_1tex(struct xa_context *r, float x, float y, float s, float t)
     r->buffer_size += 8;
 }
 
-static INLINE void
+static inline void
 add_vertex_2tex(struct xa_context *r,
 		float x, float y, float s0, float t0, float s1, float t1)
 {
diff --git a/src/gallium/state_trackers/xa/xa_tgsi.c b/src/gallium/state_trackers/xa/xa_tgsi.c
index c7454c9d6ac..5d8b8079c4b 100644
--- a/src/gallium/state_trackers/xa/xa_tgsi.c
+++ b/src/gallium/state_trackers/xa/xa_tgsi.c
@@ -106,7 +106,7 @@ struct xa_shaders {
     struct cso_hash *fs_hash;
 };
 
-static INLINE void
+static inline void
 src_in_mask(struct ureg_program *ureg,
 	    struct ureg_dst dst,
 	    struct ureg_src src,
@@ -368,7 +368,7 @@ create_yuv_shader(struct pipe_context *pipe, struct ureg_program *ureg)
     return ureg_create_shader_and_destroy(ureg, pipe);
 }
 
-static INLINE void
+static inline void
 xrender_tex(struct ureg_program *ureg,
 	    struct ureg_dst dst,
 	    struct ureg_src coords,
@@ -617,7 +617,7 @@ xa_shaders_destroy(struct xa_shaders *sc)
     FREE(sc);
 }
 
-static INLINE void *
+static inline void *
 shader_from_cache(struct pipe_context *pipe,
 		  unsigned type, struct cso_hash *hash, unsigned key)
 {
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index f69ac8edf27..21ca57ca633 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -153,7 +153,7 @@ xa_tracker_create(int drm_fd)
     loader_fd = dup(drm_fd);
     if (loader_fd == -1)
         return NULL;
-    if (pipe_loader_drm_probe_fd(&xa->dev, loader_fd, false))
+    if (pipe_loader_drm_probe_fd(&xa->dev, loader_fd))
 	xa->screen = pipe_loader_create_screen(xa->dev, PIPE_SEARCH_DIR);
 #endif
     if (!xa->screen)
@@ -461,7 +461,7 @@ xa_surface_redefine(struct xa_surface *srf,
 			xa_min(save_height, template->height0), &src_box);
 	pipe->resource_copy_region(pipe, texture,
 				   0, 0, 0, 0, srf->tex, 0, &src_box);
-	pipe->flush(pipe, &xa->default_ctx->last_fence, 0);
+	xa_context_flush(xa->default_ctx);
     }
 
     pipe_resource_reference(&srf->tex, texture);
diff --git a/src/gallium/state_trackers/xa/xa_yuv.c b/src/gallium/state_trackers/xa/xa_yuv.c
index 15196392ac7..97a1833ff15 100644
--- a/src/gallium/state_trackers/xa/xa_yuv.c
+++ b/src/gallium/state_trackers/xa/xa_yuv.c
@@ -154,7 +154,7 @@ xa_yuv_planar_blit(struct xa_context *r,
 	box++;
     }
 
-    r->pipe->flush(r->pipe, &r->last_fence, 0);
+    xa_context_flush(r);
 
     xa_ctx_sampler_views_destroy(r);
     xa_ctx_srf_destroy(r);
diff --git a/src/gallium/state_trackers/xvmc/Makefile.am b/src/gallium/state_trackers/xvmc/Makefile.am
index 047d05b3719..3c7c35c8c37 100644
--- a/src/gallium/state_trackers/xvmc/Makefile.am
+++ b/src/gallium/state_trackers/xvmc/Makefile.am
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
diff --git a/src/gallium/state_trackers/xvmc/surface.c b/src/gallium/state_trackers/xvmc/surface.c
index f32e85bf489..15eae59ff6e 100644
--- a/src/gallium/state_trackers/xvmc/surface.c
+++ b/src/gallium/state_trackers/xvmc/surface.c
@@ -489,7 +489,7 @@ Status XvMCGetSurfaceStatus(Display *dpy, XvMCSurface *surface, int *status)
    *status = 0;
 
    if (surface_priv->fence)
-      if (!pipe->screen->fence_signalled(pipe->screen, surface_priv->fence))
+      if (!pipe->screen->fence_finish(pipe->screen, surface_priv->fence, 0))
          *status |= XVMC_RENDERING;
 
    return Success;
diff --git a/src/gallium/state_trackers/xvmc/xvmc_private.h b/src/gallium/state_trackers/xvmc/xvmc_private.h
index 84c7b6cba0b..a1d026f704e 100644
--- a/src/gallium/state_trackers/xvmc/xvmc_private.h
+++ b/src/gallium/state_trackers/xvmc/xvmc_private.h
@@ -106,7 +106,7 @@ typedef struct
 #define XVMC_WARN  2
 #define XVMC_TRACE 3
 
-static INLINE void XVMC_MSG(int level, const char *fmt, ...)
+static inline void XVMC_MSG(int level, const char *fmt, ...)
 {
    static int debug_level = -1;
 
diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am
index 591978f1f61..fe5b0b11679 100644
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -118,8 +118,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 d3dadapter9_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index 6342ab801a9..680f5164e60 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -101,7 +101,7 @@ drm_destroy( struct d3dadapter9_context *ctx )
 
 /* read a DWORD in the form 0xnnnnnnnn, which is how sysfs pci id stuff is
  * formatted. */
-static INLINE DWORD
+static inline DWORD
 read_file_dword( const char *name )
 {
     char buf[32];
@@ -123,7 +123,7 @@ read_file_dword( const char *name )
  * dword at an offset in the raw PCI header. The reason this isn't used for all
  * data is that the kernel will make corrections but not expose them in the raw
  * header bytes. */
-static INLINE DWORD
+static inline DWORD
 read_config_dword( int fd,
                    unsigned offset )
 {
@@ -135,7 +135,7 @@ read_config_dword( int fd,
     return r;
 }
 
-static INLINE void
+static inline void
 get_bus_info( int fd,
               DWORD *vendorid,
               DWORD *deviceid,
@@ -160,7 +160,7 @@ get_bus_info( int fd,
     }
 }
 
-static INLINE void
+static inline void
 read_descriptor( struct d3dadapter9_context *ctx,
                  int fd )
 {
@@ -243,7 +243,7 @@ drm_create_adapter( int fd,
     ctx->base.hal = dd_create_screen(fd);
 #else
     /* use pipe-loader to dlopen appropriate drm driver */
-    if (!pipe_loader_drm_probe_fd(&ctx->dev, fd, FALSE)) {
+    if (!pipe_loader_drm_probe_fd(&ctx->dev, fd)) {
         ERR("Failed to probe drm fd %d.\n", fd);
         FREE(ctx);
         close(fd);
diff --git a/src/gallium/targets/dri-vdpau.dyn b/src/gallium/targets/dri-vdpau.dyn
index e5923a23b39..a7919f7d3ba 100644
--- a/src/gallium/targets/dri-vdpau.dyn
+++ b/src/gallium/targets/dri-vdpau.dyn
@@ -1,4 +1,5 @@
 {
 	nouveau_drm_screen_create;
 	radeon_drm_winsys_create;
+	amdgpu_winsys_create;
 };
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index 5ba129b7961..7168e1dbfb3 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -35,17 +35,15 @@ endif
 
 LOCAL_SRC_FILES := target.c
 
-LOCAL_CFLAGS := -DDRI_TARGET -DHAVE_LIBDRM
+LOCAL_CFLAGS := -DDRI_TARGET
 
 LOCAL_SHARED_LIBRARIES := \
 	libdl \
 	libglapi \
 	libexpat \
 
-# swrast only?
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
-else
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES += libdrm
 endif
 
@@ -87,7 +85,7 @@ gallium_DRIVERS += libmesa_winsys_radeon libmesa_pipe_radeon
 LOCAL_SHARED_LIBRARIES += libdrm_radeon
 endif
 ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_softpipe libmesa_winsys_sw_dri libmesa_winsys_sw_kms_dri
+gallium_DRIVERS += libmesa_pipe_softpipe libmesa_winsys_sw_dri
 LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
 endif
 ifneq ($(filter vc4,$(MESA_GPU_DRIVERS)),)
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index 96483964589..7c86ea13652 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -95,8 +95,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 gallium_dri_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/dri/SConscript b/src/gallium/targets/dri/SConscript
index a51ed564344..2fb0da09200 100644
--- a/src/gallium/targets/dri/SConscript
+++ b/src/gallium/targets/dri/SConscript
@@ -25,11 +25,12 @@ if env['llvm']:
     env.Append(CPPDEFINES = 'GALLIUM_LLVMPIPE')
     env.Prepend(LIBS = [llvmpipe])
 
+env.PkgUseModules('DRM')
+
 env.Append(CPPDEFINES = [
     'GALLIUM_VMWGFX',
     'GALLIUM_SOFTPIPE',
     'DRI_TARGET',
-    'HAVE_LIBDRM',
 ])
 
 env.Prepend(LIBS = [
@@ -37,7 +38,6 @@ env.Prepend(LIBS = [
     svgadrm,
     svga,
     ws_dri,
-    ws_kms_dri,
     softpipe,
     libloader,
     mesautil,
@@ -58,9 +58,6 @@ module = env.LoadableModule(
 env.Command('vmwgfx_dri.so', 'gallium_dri.so', "ln -f ${SOURCE} ${TARGET}")
 # swrast_dri.so
 env.Command('swrast_dri.so', 'gallium_dri.so', "ln -f ${SOURCE} ${TARGET}")
-# kms_swrast_dri.so
-env.Command('kms_swrast_dri.so', 'gallium_dri.so', "ln -f ${SOURCE} ${TARGET}")
 
 env.Alias('dri-vmwgfx', module)
 env.Alias('dri-swrast', module)
-env.Alias('dri-kms-swrast', module)
diff --git a/src/gallium/targets/dri/dri.sym b/src/gallium/targets/dri/dri.sym
index 49a2cc9fcf2..8e26fb960b7 100644
--- a/src/gallium/targets/dri/dri.sym
+++ b/src/gallium/targets/dri/dri.sym
@@ -4,6 +4,7 @@
 		__driDriverGetExtensions*;
 		nouveau_drm_screen_create;
 		radeon_drm_winsys_create;
+		amdgpu_winsys_create;
 	local:
 		*;
 };
diff --git a/src/gallium/targets/omx/Makefile.am b/src/gallium/targets/omx/Makefile.am
index f52e66946ed..a4dff487dd8 100644
--- a/src/gallium/targets/omx/Makefile.am
+++ b/src/gallium/targets/omx/Makefile.am
@@ -57,8 +57,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 libomx_mesa_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
index 5daf327fb47..4ab706ef2ac 100644
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 include $(top_srcdir)/src/gallium/Automake.inc
 
 lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la
@@ -7,7 +5,7 @@ lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la
 lib@OPENCL_LIBNAME@_la_LDFLAGS = \
 	$(LLVM_LDFLAGS) \
 	-no-undefined \
-	-version-number 1:0 \
+	-version-number @OPENCL_VERSION@:0 \
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
@@ -17,12 +15,11 @@ lib@OPENCL_LIBNAME@_la_LDFLAGS += \
 endif
 
 lib@OPENCL_LIBNAME@_la_LIBADD = \
-	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_client.la \
+	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
 	$(top_builddir)/src/gallium/state_trackers/clover/libclover.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_CLIENT_LIBS) \
 	$(ELF_LIB) \
 	-ldl \
 	-lclangCodeGen \
diff --git a/src/gallium/targets/opencl/mesa.icd b/src/gallium/targets/opencl/mesa.icd
deleted file mode 100644
index 6a6a8706d7c..00000000000
--- a/src/gallium/targets/opencl/mesa.icd
+++ /dev/null
@@ -1 +0,0 @@
-libMesaOpenCL.so
diff --git a/src/gallium/targets/opencl/mesa.icd.in b/src/gallium/targets/opencl/mesa.icd.in
new file mode 100644
index 00000000000..1b77b4e4929
--- /dev/null
+++ b/src/gallium/targets/opencl/mesa.icd.in
@@ -0,0 +1 @@
+lib@OPENCL_LIBNAME@.so.@OPENCL_VERSION@
diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am
index e4048b58605..4d9f7be2ec9 100644
--- a/src/gallium/targets/pipe-loader/Makefile.am
+++ b/src/gallium/targets/pipe-loader/Makefile.am
@@ -155,10 +155,12 @@ nodist_EXTRA_pipe_radeonsi_la_SOURCES = dummy.cpp
 pipe_radeonsi_la_LIBADD = \
 	$(PIPE_LIBS) \
 	$(top_builddir)/src/gallium/winsys/radeon/drm/libradeonwinsys.la \
+	$(top_builddir)/src/gallium/winsys/amdgpu/drm/libamdgpuwinsys.la \
 	$(top_builddir)/src/gallium/drivers/radeon/libradeon.la \
 	$(top_builddir)/src/gallium/drivers/radeonsi/libradeonsi.la \
 	$(LIBDRM_LIBS) \
-	$(RADEON_LIBS)
+	$(RADEON_LIBS) \
+	$(AMDGPU_LIBS)
 
 endif
 
diff --git a/src/gallium/targets/pipe-loader/pipe_radeonsi.c b/src/gallium/targets/pipe-loader/pipe_radeonsi.c
index 5457b5b5e32..31077af6a04 100644
--- a/src/gallium/targets/pipe-loader/pipe_radeonsi.c
+++ b/src/gallium/targets/pipe-loader/pipe_radeonsi.c
@@ -2,6 +2,7 @@
 #include "target-helpers/inline_debug_helper.h"
 #include "radeon/drm/radeon_drm_public.h"
 #include "radeon/radeon_winsys.h"
+#include "amdgpu/drm/amdgpu_public.h"
 #include "radeonsi/si_public.h"
 
 static struct pipe_screen *
@@ -9,7 +10,12 @@ create_screen(int fd)
 {
    struct radeon_winsys *rw;
 
-   rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+   /* First, try amdgpu. */
+   rw = amdgpu_winsys_create(fd, radeonsi_screen_create);
+
+   if (!rw)
+      rw = radeon_drm_winsys_create(fd, radeonsi_screen_create);
+
    return rw ? debug_screen_wrap(rw->screen) : NULL;
 }
 
diff --git a/src/gallium/targets/va/Makefile.am b/src/gallium/targets/va/Makefile.am
index 57c7e353ae9..9613f041b58 100644
--- a/src/gallium/targets/va/Makefile.am
+++ b/src/gallium/targets/va/Makefile.am
@@ -54,8 +54,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 gallium_drv_video_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/vdpau/Makefile.am b/src/gallium/targets/vdpau/Makefile.am
index 9455fc4cae5..7eb62c1cc78 100644
--- a/src/gallium/targets/vdpau/Makefile.am
+++ b/src/gallium/targets/vdpau/Makefile.am
@@ -66,8 +66,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 libvdpau_gallium_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/vdpau/vdpau.sym b/src/gallium/targets/vdpau/vdpau.sym
index f184193c055..5e71c6285a6 100644
--- a/src/gallium/targets/vdpau/vdpau.sym
+++ b/src/gallium/targets/vdpau/vdpau.sym
@@ -3,6 +3,7 @@
                vdp_imp_device_create_x11;
                nouveau_drm_screen_create;
                radeon_drm_winsys_create;
+               amdgpu_winsys_create;
        local:
                *;
 };
diff --git a/src/gallium/targets/xa/Makefile.am b/src/gallium/targets/xa/Makefile.am
index 8ddb9672bd7..92173dedce3 100644
--- a/src/gallium/targets/xa/Makefile.am
+++ b/src/gallium/targets/xa/Makefile.am
@@ -81,8 +81,7 @@ else # HAVE_GALLIUM_STATIC_TARGETS
 
 libxatracker_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/targets/xvmc/Makefile.am b/src/gallium/targets/xvmc/Makefile.am
index 3c16c8d51eb..b3285890822 100644
--- a/src/gallium/targets/xvmc/Makefile.am
+++ b/src/gallium/targets/xvmc/Makefile.am
@@ -52,11 +52,9 @@ libXvMCgallium_la_LIBADD += $(TARGET_LIB_DEPS) \
 	$(TARGET_RADEON_WINSYS) $(TARGET_RADEON_COMMON)
 
 else # HAVE_GALLIUM_STATIC_TARGETS
-# XXX: Use the pipe-loader-client over pipe-loader ?
 libXvMCgallium_la_LIBADD += \
 	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
-	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_LIBS)
+	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS)
 
 endif # HAVE_GALLIUM_STATIC_TARGETS
 
diff --git a/src/gallium/tests/graw/graw_util.h b/src/gallium/tests/graw/graw_util.h
index afcc584863e..e7cd0aa3ac3 100644
--- a/src/gallium/tests/graw/graw_util.h
+++ b/src/gallium/tests/graw/graw_util.h
@@ -26,7 +26,7 @@ struct graw_info
 
 
 
-static INLINE boolean
+static inline boolean
 graw_util_create_window(struct graw_info *info,
                         int width, int height,
                         int num_cbufs, bool zstencil_buf)
@@ -144,7 +144,7 @@ graw_util_create_window(struct graw_info *info,
 }
 
 
-static INLINE void
+static inline void
 graw_util_default_state(struct graw_info *info, boolean depth_test)
 {
    {
@@ -181,7 +181,7 @@ graw_util_default_state(struct graw_info *info, boolean depth_test)
 }
 
 
-static INLINE void
+static inline void
 graw_util_viewport(struct graw_info *info,
                    float x, float y,
                    float width, float height,
@@ -205,7 +205,7 @@ graw_util_viewport(struct graw_info *info,
 }
 
 
-static INLINE void
+static inline void
 graw_util_flush_front(const struct graw_info *info)
 {
    info->screen->flush_frontbuffer(info->screen, info->color_buf[0],
@@ -213,7 +213,7 @@ graw_util_flush_front(const struct graw_info *info)
 }
 
 
-static INLINE struct pipe_resource *
+static inline struct pipe_resource *
 graw_util_create_tex2d(const struct graw_info *info,
                        int width, int height, enum pipe_format format,
                        const void *data)
@@ -278,7 +278,7 @@ graw_util_create_tex2d(const struct graw_info *info,
 }
 
 
-static INLINE void *
+static inline void *
 graw_util_create_simple_sampler(const struct graw_info *info,
                                 unsigned wrap_mode,
                                 unsigned img_filter)
@@ -304,7 +304,7 @@ graw_util_create_simple_sampler(const struct graw_info *info,
 }
 
 
-static INLINE struct pipe_sampler_view *
+static inline struct pipe_sampler_view *
 graw_util_create_simple_sampler_view(const struct graw_info *info,
                                      struct pipe_resource *texture)
 {
diff --git a/src/gallium/tests/trivial/Makefile.am b/src/gallium/tests/trivial/Makefile.am
index fcd240e85bb..56b7f3ffc66 100644
--- a/src/gallium/tests/trivial/Makefile.am
+++ b/src/gallium/tests/trivial/Makefile.am
@@ -12,11 +12,10 @@ AM_CPPFLAGS = \
 	$(GALLIUM_PIPE_LOADER_DEFINES)
 
 LDADD = \
-	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_client.la \
+	$(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \
-	$(GALLIUM_PIPE_LOADER_CLIENT_LIBS) \
 	$(GALLIUM_COMMON_LIB_DEPS)
 
 noinst_PROGRAMS = compute tri quad-tex
diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index daae577ec4b..c019c7bb0a3 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -297,6 +297,8 @@ static void close_prog(struct program *p)
 
 static void draw(struct program *p)
 {
+	const struct pipe_sampler_state *samplers[] = {&p->sampler};
+
 	/* set the render target */
 	cso_set_framebuffer(p->cso, &p->framebuffer);
 
@@ -310,8 +312,7 @@ static void draw(struct program *p)
 	cso_set_viewport(p->cso, &p->viewport);
 
 	/* sampler */
-	cso_single_sampler(p->cso, PIPE_SHADER_FRAGMENT, 0, &p->sampler);
-	cso_single_sampler_done(p->cso, PIPE_SHADER_FRAGMENT);
+	cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
 
 	/* texture sampler view */
 	cso_set_sampler_views(p->cso, PIPE_SHADER_FRAGMENT, 1, &p->view);
diff --git a/src/gallium/winsys/sw/kms-dri/Android.mk b/src/gallium/winsys/amdgpu/drm/Android.mk
similarity index 87%
rename from src/gallium/winsys/sw/kms-dri/Android.mk
rename to src/gallium/winsys/amdgpu/drm/Android.mk
index b065242aaf3..7d507aa79c6 100644
--- a/src/gallium/winsys/sw/kms-dri/Android.mk
+++ b/src/gallium/winsys/amdgpu/drm/Android.mk
@@ -1,7 +1,7 @@
 # Mesa 3-D graphics library
 #
-# Copyright (C) 2015 Chih-Wei Huang <cwhuang@linux.org.tw>
-# Copyright (C) 2015 Android-x86 Open Source Project
+# Copyright (C) 2011 Chia-I Wu <olvaffe@gmail.com>
+# Copyright (C) 2011 LunarG Inc.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -23,15 +23,15 @@
 
 LOCAL_PATH := $(call my-dir)
 
+# get C_SOURCES
 include $(LOCAL_PATH)/Makefile.sources
 
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := $(C_SOURCES)
 
-LOCAL_MODULE := libmesa_winsys_sw_kms_dri
-
-LOCAL_SHARED_LIBRARIES := libdrm
+LOCAL_SHARED_LIBRARIES := libdrm libdrm_amdgpu
+LOCAL_MODULE := libmesa_winsys_amdgpu
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.am b/src/gallium/winsys/amdgpu/drm/Makefile.am
new file mode 100644
index 00000000000..a719913b157
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.am
@@ -0,0 +1,17 @@
+include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
+
+AM_CFLAGS = \
+	$(GALLIUM_WINSYS_CFLAGS) \
+	$(AMDGPU_CFLAGS) \
+	-I$(srcdir)/addrlib \
+	-I$(srcdir)/addrlib/core \
+	-I$(srcdir)/addrlib/inc/chip/r800 \
+	-I$(srcdir)/addrlib/r800/chip \
+	-DBRAHMA_BUILD=1
+
+AM_CXXFLAGS = $(AM_CFLAGS)
+
+noinst_LTLIBRARIES = libamdgpuwinsys.la
+
+libamdgpuwinsys_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.sources b/src/gallium/winsys/amdgpu/drm/Makefile.sources
new file mode 100644
index 00000000000..6b33841b204
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.sources
@@ -0,0 +1,31 @@
+C_SOURCES := \
+	addrlib/addrinterface.cpp \
+	addrlib/addrinterface.h \
+	addrlib/addrtypes.h \
+	addrlib/core/addrcommon.h \
+	addrlib/core/addrelemlib.cpp \
+	addrlib/core/addrelemlib.h \
+	addrlib/core/addrlib.cpp \
+	addrlib/core/addrlib.h \
+	addrlib/core/addrobject.cpp \
+	addrlib/core/addrobject.h \
+	addrlib/inc/chip/r800/si_gb_reg.h \
+	addrlib/inc/lnx_common_defs.h \
+	addrlib/r800/chip/si_ci_merged_enum.h \
+	addrlib/r800/chip/si_ci_vi_merged_enum.h \
+	addrlib/r800/chip/si_enum.h \
+	addrlib/r800/ciaddrlib.cpp \
+	addrlib/r800/ciaddrlib.h \
+	addrlib/r800/egbaddrlib.cpp \
+	addrlib/r800/egbaddrlib.h \
+	addrlib/r800/siaddrlib.cpp \
+	addrlib/r800/siaddrlib.h \
+	amdgpu_bo.c \
+	amdgpu_bo.h \
+	amdgpu_cs.c \
+	amdgpu_cs.h \
+	amdgpu_id.h \
+	amdgpu_public.h \
+	amdgpu_surface.c \
+	amdgpu_winsys.c \
+	amdgpu_winsys.h
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.cpp
new file mode 100644
index 00000000000..65569278b1e
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.cpp
@@ -0,0 +1,1008 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrinterface.cpp
+* @brief Contains the addrlib interface functions
+***************************************************************************************************
+*/
+#include "addrinterface.h"
+#include "addrlib.h"
+
+#include "addrcommon.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Create/Destroy/Config functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrCreate
+*
+*   @brief
+*       Create address lib object
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCreate(
+    const ADDR_CREATE_INPUT*    pAddrCreateIn,  ///< [in] information for creating address lib object
+    ADDR_CREATE_OUTPUT*         pAddrCreateOut) ///< [out] address lib handle
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    returnCode = AddrLib::Create(pAddrCreateIn, pAddrCreateOut);
+
+    return returnCode;
+}
+
+
+
+/**
+***************************************************************************************************
+*   AddrDestroy
+*
+*   @brief
+*       Destroy address lib object
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrDestroy(
+    ADDR_HANDLE hLib) ///< [in] address lib handle
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (hLib)
+    {
+        AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+        pLib->Destroy();
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                    Surface functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceInfo
+*
+*   @brief
+*       Calculate surface width/height/depth/alignments and suitable tiling mode
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,  ///< [in] surface information
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut) ///< [out] surface parameters and alignments
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSurfaceInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceAddrFromCoord
+*
+*   @brief
+*       Compute surface address according to coordinates
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,  ///< [in] surface info and coordinates
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut) ///< [out] surface address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSurfaceAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceCoordFromAddr
+*
+*   @brief
+*       Compute coordinates according to surface address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,  ///< [in] surface info and address
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut) ///< [out] coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSurfaceCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                   HTile functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileInfo
+*
+*   @brief
+*       Compute Htile pitch, height, base alignment and size in bytes
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_HTILE_INFO_INPUT*    pIn,  ///< [in] Htile information
+    ADDR_COMPUTE_HTILE_INFO_OUTPUT*         pOut) ///< [out] Htile pitch, height and size in bytes
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeHtileInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileAddrFromCoord
+*
+*   @brief
+*       Compute Htile address according to coordinates (of depth buffer)
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*   pIn,  ///< [in] Htile info and coordinates
+    ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT*        pOut) ///< [out] Htile address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeHtileAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileCoordFromAddr
+*
+*   @brief
+*       Compute coordinates within depth buffer (1st pixel of a micro tile) according to
+*       Htile address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*   pIn,  ///< [in] Htile info and address
+    ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT*        pOut) ///< [out] Htile coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeHtileCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     C-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskInfo
+*
+*   @brief
+*       Compute Cmask pitch, height, base alignment and size in bytes from color buffer
+*       info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_CMASK_INFO_INPUT*    pIn,  ///< [in] Cmask pitch and height
+    ADDR_COMPUTE_CMASK_INFO_OUTPUT*         pOut) ///< [out] Cmask pitch, height and size in bytes
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeCmaskInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Compute Cmask address according to coordinates (of MSAA color buffer)
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*   pIn,  ///< [in] Cmask info and coordinates
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*        pOut) ///< [out] Cmask address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeCmaskAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskCoordFromAddr
+*
+*   @brief
+*       Compute coordinates within color buffer (1st pixel of a micro tile) according to
+*       Cmask address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*   pIn,  ///< [in] Cmask info and address
+    ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT*        pOut) ///< [out] Cmask coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeCmaskCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     F-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskInfo
+*
+*   @brief
+*       Compute Fmask pitch/height/depth/alignments and size in bytes
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskInfo(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,  ///< [in] Fmask information
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut) ///< [out] Fmask pitch and height
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeFmaskInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Compute Fmask address according to coordinates (x,y,slice,sample,plane)
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,  ///< [in] Fmask info and coordinates
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut) ///< [out] Fmask address
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeFmaskAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Compute coordinates (x,y,slice,sample,plane) according to Fmask address
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib, ///< [in] address lib handle
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,  ///< [in] Fmask info and address
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut) ///< [out] Fmask coordinates
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeFmaskCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     DCC key functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrComputeDccInfo
+*
+*   @brief
+*       Compute DCC key size, base alignment based on color surface size, tile info or tile index
+*   @return ADDR_ERROR if no AddrLib is obtained from hLib, else the ComputeDccInfo() result
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeDccInfo(
+    ADDR_HANDLE                             hLib,   ///< [in] handle of addrlib
+    const ADDR_COMPUTE_DCCINFO_INPUT*       pIn,    ///< [in] input
+    ADDR_COMPUTE_DCCINFO_OUTPUT*            pOut)   ///< [out] output
+{
+    ADDR_E_RETURNCODE returnCode;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+       returnCode = pLib->ComputeDccInfo(pIn, pOut);
+    }
+    else
+    {
+       returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Below functions are element related or helper functions
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrGetVersion
+*
+*   @brief
+*       Get AddrLib version number. Client may check this return value against ADDRLIB_VERSION
+*       defined in addrinterface.h to see if there is a mismatch.
+***************************************************************************************************
+*/
+UINT_32 ADDR_API AddrGetVersion(ADDR_HANDLE hLib)
+{
+    UINT_32 version = 0;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_ASSERT(pLib != NULL);
+
+    if (pLib)
+    {
+        version = pLib->GetVersion();
+    }
+
+    return version;
+}
+
+/**
+***************************************************************************************************
+*   AddrUseTileIndex
+*
+*   @brief
+*       Return TRUE if tileIndex is enabled in this address library
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseTileIndex(ADDR_HANDLE hLib)
+{
+    BOOL_32 useTileIndex = FALSE;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_ASSERT(pLib != NULL);
+
+    if (pLib)
+    {
+        useTileIndex = pLib->UseTileIndex(0);
+    }
+
+    return useTileIndex;
+}
+
+/**
+***************************************************************************************************
+*   AddrUseCombinedSwizzle
+*
+*   @brief
+*       Return TRUE if combined swizzle is enabled in this address library
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseCombinedSwizzle(ADDR_HANDLE hLib)
+{
+    BOOL_32 useCombinedSwizzle = FALSE;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_ASSERT(pLib != NULL);
+
+    if (pLib)
+    {
+        useCombinedSwizzle = pLib->UseCombinedSwizzle();
+    }
+
+    return useCombinedSwizzle;
+}
+
+/**
+***************************************************************************************************
+*   AddrExtractBankPipeSwizzle
+*
+*   @brief
+*       Extract Bank and Pipe swizzle from base256b
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrExtractBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,     ///< [in] addrlib handle
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,      ///< [in] input structure
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut)     ///< [out] output structure
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ExtractBankPipeSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrCombineBankPipeSwizzle
+*
+*   @brief
+*       Combine Bank and Pipe swizzle
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCombineBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT*       pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->CombineBankPipeSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeSliceSwizzle
+*
+*   @brief
+*       Compute a swizzle for slice from a base swizzle
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSliceSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*      pIn,
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*           pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeSliceTileSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputeBaseSwizzle
+*
+*   @brief
+*       Return a combined bank and pipe swizzle based on surface type/index
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeBaseSwizzle(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT*       pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputeBaseSwizzle(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToDepthPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a depth/stencil pixel value
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToDepthPixel(
+    ADDR_HANDLE                         hLib,    ///< [in] addrlib handle
+    const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,     ///< [in] per-component value
+    ELEM_FLT32TODEPTHPIXEL_OUTPUT*      pOut)    ///< [out] final pixel value
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        pLib->Flt32ToDepthPixel(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToColorPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a red/green/blue/alpha pixel value
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToColorPixel(
+    ADDR_HANDLE                         hLib,    ///< [in] addrlib handle
+    const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,     ///< [in] format, surface number and swap value
+    ELEM_FLT32TOCOLORPIXEL_OUTPUT*      pOut)    ///< [out] final pixel value
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        pLib->Flt32ToColorPixel(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   ElemGetExportNorm
+*
+*   @brief
+*       Helper function to check whether a format can be EXPORT_NORM,
+*       which is a register CB_COLOR_INFO.SURFACE_FORMAT.
+*       FP16 can be reported as EXPORT_NORM for rv770 in r600
+*       family
+*
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API ElemGetExportNorm(
+    ADDR_HANDLE                     hLib, ///< [in] addrlib handle
+    const ELEM_GETEXPORTNORM_INPUT* pIn)  ///< [in] input structure
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+    BOOL_32 enabled = FALSE;
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        enabled = pLib->GetExportNorm(pIn);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    ADDR_ASSERT(returnCode == ADDR_OK);
+
+    return enabled;
+}
+
+/**
+***************************************************************************************************
+*   AddrConvertTileInfoToHW
+*
+*   @brief
+*       Convert tile info from real value to hardware register value
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileInfoToHW(
+    ADDR_HANDLE                             hLib, ///< [in] address lib handle
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT*  pIn,  ///< [in] tile info with real value
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT*       pOut) ///< [out] tile info with HW register value
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ConvertTileInfoToHW(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex(
+    ADDR_HANDLE                          hLib, ///< [in] address lib handle
+    const ADDR_CONVERT_TILEINDEX_INPUT*  pIn,  ///< [in] input - tile index
+    ADDR_CONVERT_TILEINDEX_OUTPUT*       pOut) ///< [out] tile mode/type/info
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ConvertTileIndex(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex1
+*
+*   @brief
+*       Convert tile index to tile mode/type/info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex1(
+    ADDR_HANDLE                          hLib, ///< [in] address lib handle
+    const ADDR_CONVERT_TILEINDEX1_INPUT* pIn,  ///< [in] input - tile index
+    ADDR_CONVERT_TILEINDEX_OUTPUT*       pOut) ///< [out] tile mode/type/info
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ConvertTileIndex1(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrGetTileIndex
+*
+*   @brief
+*       Get tile index from tile mode/type/info
+*
+*   @return
+*       ADDR_OK if successful, otherwise an error code of ADDR_E_RETURNCODE
+*
+*   @note
+*       Only meaningful for SI (and above)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrGetTileIndex(
+    ADDR_HANDLE                     hLib,
+    const ADDR_GET_TILEINDEX_INPUT* pIn,
+    ADDR_GET_TILEINDEX_OUTPUT*      pOut)
+{
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->GetTileIndex(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrComputePrtInfo
+*
+*   @brief
+*       Interface function for ComputePrtInfo
+*   @return ADDR_ERROR if no AddrLib is obtained from hLib, else the ComputePrtInfo() result
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputePrtInfo(
+    ADDR_HANDLE                 hLib,
+    const ADDR_PRT_INFO_INPUT*  pIn,
+    ADDR_PRT_INFO_OUTPUT*       pOut)
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    AddrLib* pLib = AddrLib::GetAddrLib(hLib);
+
+    if (pLib != NULL)
+    {
+        returnCode = pLib->ComputePrtInfo(pIn, pOut);
+    }
+    else
+    {
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h
new file mode 100644
index 00000000000..03fbf2bd0ee
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h
@@ -0,0 +1,2166 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrinterface.h
+* @brief Contains the addrlib interfaces declaration and parameter defines
+***************************************************************************************************
+*/
+#ifndef __ADDR_INTERFACE_H__
+#define __ADDR_INTERFACE_H__
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#include "addrtypes.h"
+
+#define ADDRLIB_VERSION_MAJOR 5
+#define ADDRLIB_VERSION_MINOR 25
+#define ADDRLIB_VERSION ((ADDRLIB_VERSION_MAJOR << 16) | ADDRLIB_VERSION_MINOR)
+
+/// Library handle (a VOID*); virtually all interface functions need ADDR_HANDLE as first parameter
+typedef VOID*   ADDR_HANDLE;
+
+/// Client handle (a VOID*), carried as hClient in the callback input structures
+typedef VOID*   ADDR_CLIENT_HANDLE;
+
+/**
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                  Callback functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*    typedef VOID* (ADDR_API* ADDR_ALLOCSYSMEM)(
+*         const ADDR_ALLOCSYSMEM_INPUT* pInput);
+*    typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_FREESYSMEM)(
+*         const ADDR_FREESYSMEM_INPUT* pInput);
+*    typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_DEBUGPRINT)(
+*         const ADDR_DEBUGPRINT_INPUT* pInput);
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                               Create/Destroy/Config functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrCreate()
+*     AddrDestroy()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                  Surface functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeSurfaceInfo()
+*     AddrComputeSurfaceAddrFromCoord()
+*     AddrComputeSurfaceCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                   HTile functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeHtileInfo()
+*     AddrComputeHtileAddrFromCoord()
+*     AddrComputeHtileCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                   C-mask functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeCmaskInfo()
+*     AddrComputeCmaskAddrFromCoord()
+*     AddrComputeCmaskCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                   F-mask functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrComputeFmaskInfo()
+*     AddrComputeFmaskAddrFromCoord()
+*     AddrComputeFmaskCoordFromAddr()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                               Element/Utility functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     ElemFlt32ToDepthPixel()
+*     ElemFlt32ToColorPixel()
+*     AddrExtractBankPipeSwizzle()
+*     AddrCombineBankPipeSwizzle()
+*     AddrComputeSliceSwizzle()
+*     AddrConvertTileInfoToHW()
+*     AddrConvertTileIndex()
+*     AddrConvertTileIndex1()
+*     AddrGetTileIndex()
+*     AddrComputeBaseSwizzle()
+*     AddrUseTileIndex()
+*     AddrUseCombinedSwizzle()
+*
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+* //                                    Dump functions
+* /////////////////////////////////////////////////////////////////////////////////////////////////
+*     AddrDumpSurfaceInfo()
+*     AddrDumpFmaskInfo()
+*     AddrDumpCmaskInfo()
+*     AddrDumpHtileInfo()
+*
+**/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                      Callback functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+* @brief Alloc system memory flags.
+* @note These flags are reserved for future use; having the field in place now minimizes the
+*       impact on the client if flags are added later.
+***************************************************************************************************
+*/
+typedef union _ADDR_ALLOCSYSMEM_FLAGS
+{
+    struct
+    {
+        UINT_32 reserved    : 32;  ///< No flags are defined yet; all 32 bits are reserved.
+    } fields;
+    UINT_32 value;                 ///< The same bits accessed as a single UINT_32.
+
+} ADDR_ALLOCSYSMEM_FLAGS;
+
+/**
+***************************************************************************************************
+* @brief Alloc system memory input structure
+***************************************************************************************************
+*/
+typedef struct _ADDR_ALLOCSYSMEM_INPUT
+{
+    UINT_32                 size;           ///< Size of this structure in bytes
+
+    ADDR_ALLOCSYSMEM_FLAGS  flags;          ///< System memory flags (all bits currently reserved)
+    UINT_32                 sizeInBytes;    ///< Size of the requested allocation, in bytes
+    ADDR_CLIENT_HANDLE      hClient;        ///< Client handle
+} ADDR_ALLOCSYSMEM_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_ALLOCSYSMEM
+*   @brief
+*       Allocate system memory callback; request is passed in pInput. Returns valid pointer on success.
+***************************************************************************************************
+*/
+typedef VOID* (ADDR_API* ADDR_ALLOCSYSMEM)(
+    const ADDR_ALLOCSYSMEM_INPUT* pInput);
+
+/**
+***************************************************************************************************
+* @brief Free system memory input structure
+***************************************************************************************************
+*/
+typedef struct _ADDR_FREESYSMEM_INPUT
+{
+    UINT_32                 size;           ///< Size of this structure in bytes
+
+    VOID*                   pVirtAddr;      ///< Virtual address of the system memory to free
+    ADDR_CLIENT_HANDLE      hClient;        ///< Client handle
+} ADDR_FREESYSMEM_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_FREESYSMEM
+*   @brief
+*       Free system memory callback function; the request is passed in pInput.
+*       Returns ADDR_OK on success.
+***************************************************************************************************
+*/
+typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_FREESYSMEM)(
+    const ADDR_FREESYSMEM_INPUT* pInput);
+
+/**
+***************************************************************************************************
+* @brief Print debug message input structure
+***************************************************************************************************
+*/
+typedef struct _ADDR_DEBUGPRINT_INPUT
+{
+    UINT_32             size;           ///< Size of this structure in bytes
+
+    CHAR*               pDebugString;   ///< Debug print string (presumably a format for ap - confirm)
+    va_list             ap;             ///< Variable argument list accompanying pDebugString
+    ADDR_CLIENT_HANDLE  hClient;        ///< Client handle
+} ADDR_DEBUGPRINT_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_DEBUGPRINT
+*   @brief
+*       Print debug message callback function; the message is passed in pInput.
+*       Returns ADDR_OK on success.
+***************************************************************************************************
+*/
+typedef ADDR_E_RETURNCODE (ADDR_API* ADDR_DEBUGPRINT)(
+    const ADDR_DEBUGPRINT_INPUT* pInput);
+
+/**
+***************************************************************************************************
+* ADDR_CALLBACKS
+*
+*   @brief
+*       Client-provided callbacks: system memory alloc/free routines and a debug print routine.
+***************************************************************************************************
+*/
+typedef struct _ADDR_CALLBACKS
+{
+    ADDR_ALLOCSYSMEM allocSysMem;   ///< Routine to allocate system memory
+    ADDR_FREESYSMEM  freeSysMem;    ///< Routine to free system memory
+    ADDR_DEBUGPRINT  debugPrint;    ///< Routine to print debug message
+} ADDR_CALLBACKS;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Create/Destroy functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+* ADDR_CREATE_FLAGS
+*
+*   @brief
+*       This structure is used to pass some setup in creation of AddrLib
+*   @note
+***************************************************************************************************
+*/
+typedef union _ADDR_CREATE_FLAGS
+{
+    struct
+    {
+        UINT_32 noCubeMipSlicesPad     : 1;    ///< Turn cubemap faces padding off
+        UINT_32 fillSizeFields         : 1;    ///< If clients fill size fields in all input and
+                                               ///  output structures
+        UINT_32 useTileIndex           : 1;    ///< Make tileIndex field in input valid
+        UINT_32 useCombinedSwizzle     : 1;    ///< Use combined tile swizzle
+        UINT_32 checkLast2DLevel       : 1;    ///< Check the last 2D mip sub level
+        UINT_32 useHtileSliceAlign     : 1;    ///< Do htile single slice alignment
+        UINT_32 degradeBaseLevel       : 1;    ///< Degrade to 1D modes automatically for base level
+        UINT_32 allowLargeThickTile    : 1;    ///< Allow 64*thickness*bytesPerPixel > rowSize
+        UINT_32 reserved               : 24;   ///< Reserved bits for future use
+    };
+
+    UINT_32 value;                             ///< The flag bits above accessed as a single UINT_32
+} ADDR_CREATE_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_REGISTER_VALUE
+*
+*   @brief
+*       Data from registers to setup AddrLib global data, used in AddrCreate
+***************************************************************************************************
+*/
+typedef struct _ADDR_REGISTER_VALUE
+{
+    UINT_32  gbAddrConfig;       ///< For R8xx, use GB_ADDR_CONFIG register value.
+                                 ///  For R6xx/R7xx, use GB_TILING_CONFIG.
+                                 ///  But they can be treated as the same.
+                                 ///  If this value is 0, use chip to set default value
+    UINT_32  backendDisables;    ///< 1 bit per backend, starting with LSB. 1=disabled,0=enabled.
+                                 ///  Register value of CC_RB_BACKEND_DISABLE.BACKEND_DISABLE
+
+                                 //   R800 registers-----------------------------------------------
+    UINT_32  noOfBanks;          ///< Number of h/w ram banks - For r800: MC_ARB_RAMCFG.NOOFBANK
+                                 ///  No enums for this value in h/w header files
+                                 ///  0: 4
+                                 ///  1: 8
+                                 ///  2: 16
+    UINT_32  noOfRanks;          ///< Number of h/w ram ranks - MC_ARB_RAMCFG.NOOFRANK
+                                 ///  0: 1
+                                 ///  1: 2
+                                 //   SI (R1000) registers-----------------------------------------
+    const UINT_32* pTileConfig;  ///< Global tile setting tables
+    UINT_32  noOfEntries;        ///< Number of entries in pTileConfig
+
+                                 //   CI registers-------------------------------------------------
+    const UINT_32* pMacroTileConfig;    ///< Global macro tile mode table
+    UINT_32  noOfMacroEntries;   ///< Number of entries in pMacroTileConfig
+
+} ADDR_REGISTER_VALUE;
+
+/**
+***************************************************************************************************
+* ADDR_CREATE_INPUT
+*
+*   @brief
+*       Parameters used to create an AddrLib Object. Caller must provide all fields.
+*
+***************************************************************************************************
+*/
+typedef struct _ADDR_CREATE_INPUT
+{
+    UINT_32             size;                ///< Size of this structure in bytes
+
+    UINT_32             chipEngine;          ///< Chip Engine
+    UINT_32             chipFamily;          ///< Chip Family
+    UINT_32             chipRevision;        ///< Chip Revision
+    ADDR_CALLBACKS      callbacks;           ///< Callbacks for sysmem alloc/free and debug print
+    ADDR_CREATE_FLAGS   createFlags;         ///< Flags to set up AddrLib
+    ADDR_REGISTER_VALUE regValue;            ///< Data from registers to set up AddrLib global data
+    ADDR_CLIENT_HANDLE  hClient;             ///< Client handle
+    UINT_32             minPitchAlignPixels; ///< Minimum pitch alignment in pixels
+} ADDR_CREATE_INPUT;
+
+/**
+***************************************************************************************************
+* ADDR_CREATE_OUTPUT
+*
+*   @brief
+*       Return AddrLib handle to client driver
+*
+***************************************************************************************************
+*/
+typedef struct _ADDR_CREATE_OUTPUT
+{
+    UINT_32     size;    ///< Size of this structure in bytes
+
+    ADDR_HANDLE hLib;    ///< Address lib handle
+} ADDR_CREATE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrCreate
+*
+*   @brief
+*       Create AddrLib object from pAddrCreateIn, must be called before any other interface call
+*
+*   @return
+*       ADDR_OK if successful; the AddrLib handle is returned to the client in pAddrCreateOut
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCreate(
+    const ADDR_CREATE_INPUT*    pAddrCreateIn,
+    ADDR_CREATE_OUTPUT*         pAddrCreateOut);
+
+
+
+/**
+***************************************************************************************************
+*   AddrDestroy
+*
+*   @brief
+*       Destroy the AddrLib object behind hLib, must be called to free internally allocated resources.
+*
+*   @return
+*       ADDR_OK if successful
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrDestroy(
+    ADDR_HANDLE hLib);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                    Surface functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+* @brief
+*       Bank/tiling parameters. On function input, these can be set as desired or
+*       left 0 for AddrLib to calculate/default. On function output, these are the actual
+*       parameters used.
+* @note
+*       Valid bankWidth/bankHeight value:
+*       1,2,4,8. They are factors instead of pixels or bytes.
+*
+*       The bank number remains constant across each row of the
+*       macro tile as each pipe is selected, so the number of
+*       tiles in the x direction with the same bank number will
+*       be bank_width * num_pipes.
+***************************************************************************************************
+*/
+typedef struct _ADDR_TILEINFO
+{
+    //   Any of these parameters can be set to 0 to use the HW default.
+    UINT_32     banks;              ///< Number of banks, numerical value
+    UINT_32     bankWidth;          ///< Number of tiles in the X direction in the same bank
+    UINT_32     bankHeight;         ///< Number of tiles in the Y direction in the same bank
+    UINT_32     macroAspectRatio;   ///< Macro tile aspect ratio. 1-1:1, 2-4:1, 4-16:1, 8-64:1
+    UINT_32     tileSplitBytes;     ///< Tile split size, in bytes
+    AddrPipeCfg pipeConfig;         ///< Pipe Config = HW enum + 1
+} ADDR_TILEINFO;
+
+// Keep the old R800 name as an alias to avoid client change. R800 was removed from the name
+// because we plan to implement SI within 800 HWL - an AddrPipeCfg is added in above data structure
+typedef ADDR_TILEINFO ADDR_R800_TILEINFO;
+
+/**
+***************************************************************************************************
+* @brief
+*       Information needed by quad buffer stereo support
+***************************************************************************************************
+*/
+typedef struct _ADDR_QBSTEREOINFO
+{
+    UINT_32         eyeHeight;          ///< Height (in pixel rows) to right eye
+    UINT_32         rightOffset;        ///< Offset (in bytes) to right eye
+    UINT_32         rightSwizzle;       ///< TileSwizzle for right eye
+} ADDR_QBSTEREOINFO;
+
+/**
+***************************************************************************************************
+*   ADDR_SURFACE_FLAGS
+*
+*   @brief
+*       Surface flags
+***************************************************************************************************
+*/
+typedef union _ADDR_SURFACE_FLAGS
+{
+    struct
+    {
+        UINT_32 color         : 1; ///< Flag indicates this is a color buffer
+        UINT_32 depth         : 1; ///< Flag indicates this is a depth/stencil buffer
+        UINT_32 stencil       : 1; ///< Flag indicates this is a stencil buffer
+        UINT_32 texture       : 1; ///< Flag indicates this is a texture
+        UINT_32 cube          : 1; ///< Flag indicates this is a cubemap
+
+        UINT_32 volume        : 1; ///< Flag indicates this is a volume texture
+        UINT_32 fmask         : 1; ///< Flag indicates this is an fmask
+        UINT_32 cubeAsArray   : 1; ///< Flag indicates if treat cubemap as arrays
+        UINT_32 compressZ     : 1; ///< Flag indicates z buffer is compressed
+        UINT_32 overlay       : 1; ///< Flag indicates this is an overlay surface
+        UINT_32 noStencil     : 1; ///< Flag indicates this depth has no separate stencil
+        UINT_32 display       : 1; ///< Flag indicates this should match display controller req.
+        UINT_32 opt4Space     : 1; ///< Flag indicates this surface should be optimized for space
+                                   ///  i.e. save some memory but may lose performance
+        UINT_32 prt           : 1; ///< Flag for partially resident texture
+        UINT_32 qbStereo      : 1; ///< Quad buffer stereo surface
+        UINT_32 pow2Pad       : 1; ///< SI: Pad to pow2, must set for mipmap (include level0)
+        UINT_32 interleaved   : 1; ///< Special flag for interleaved YUV surface padding
+        UINT_32 degrade4Space : 1; ///< Degrade base level's tile mode to save memory
+        UINT_32 tcCompatible  : 1; ///< Flag indicates surface needs to be shader readable
+        UINT_32 dispTileType  : 1; ///< NI: force display Tiling for 128 bit shared resource
+        UINT_32 dccCompatible : 1; ///< VI: whether to support dcc fast clear
+        UINT_32 czDispCompatible: 1; ///< SI+: CZ family (Carrizo) has a HW bug that needs special alignment.
+                                     ///       This flag indicates we need to follow the alignment with
+                                     ///       CZ families or other ASICs under PX configuration + CZ.
+        UINT_32 reserved      :10; ///< Reserved bits
+    };
+
+    UINT_32 value;                 ///< The flag bits above accessed as a single UINT_32
+} ADDR_SURFACE_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_INFO_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeSurfaceInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_INFO_INPUT
+{
+    UINT_32             size;               ///< Size of this structure in bytes
+
+    AddrTileMode        tileMode;           ///< Tile mode
+    AddrFormat          format;             ///< If format is set to valid one, bpp/width/height
+                                            ///  might be overwritten
+    UINT_32             bpp;                ///< Bits per pixel
+    UINT_32             numSamples;         ///< Number of samples
+    UINT_32             width;              ///< Width, in pixels
+    UINT_32             height;             ///< Height, in pixels
+    UINT_32             numSlices;          ///< Number of surface slices/depth,
+                                            ///  Note:
+                                            ///  For cubemap, driver clients usually set numSlices
+                                            ///  to 1 in per-face calc.
+                                            ///  For 7xx and above, we need pad faces as slices.
+                                            ///  In this case, clients should set numSlices to 6 and
+                                            ///  this can also be turned off by createFlags when
+                                            ///  calling AddrCreate
+    UINT_32             slice;              ///< Slice index
+    UINT_32             mipLevel;           ///< Current mipmap level.
+                                            ///  Padding/tiling have different rules for level0 and
+                                            ///  sublevels
+    ADDR_SURFACE_FLAGS  flags;              ///< Surface type flags
+    UINT_32             numFrags;           ///< Number of fragments, leave it zero or the same as
+                                            ///  number of samples for normal AA; Set it to the
+                                            ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    // Needed by 2D tiling; for linear and 1D tiling, just keep them 0's
+    ADDR_TILEINFO*      pTileInfo;          ///< 2D tile parameters. Set to 0 to default/calculate
+    AddrTileType        tileType;           ///< Micro tiling type, not needed when tileIndex != -1
+    INT_32              tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                            ///  while the global useTileIndex is set to 1
+    UINT_32             basePitch;          ///< Base level pitch in pixels, 0 means ignored, is a
+                                            ///  must for mip levels from SI+.
+                                            ///  Don't use pitch in blocks for compressed formats!
+} ADDR_COMPUTE_SURFACE_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_INFO_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeSurfaceInfo
+*   @note
+*       Element: AddrLib unit for computing. e.g. BCn: 4x4 blocks; R32B32B32: 32bit with 3x pitch
+*       Pixel: Original pixel
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_INFO_OUTPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         pitch;          ///< Pitch in elements (in blocks for compressed formats)
+    UINT_32         height;         ///< Height in elements (in blocks for compressed formats)
+    UINT_32         depth;          ///< Number of slice/depth
+    UINT_64         surfSize;       ///< Surface size in bytes
+    AddrTileMode    tileMode;       ///< Actual tile mode. May differ from that in input
+    UINT_32         baseAlign;      ///< Base address alignment
+    UINT_32         pitchAlign;     ///< Pitch alignment, in elements
+    UINT_32         heightAlign;    ///< Height alignment, in elements
+    UINT_32         depthAlign;     ///< Depth alignment, aligned to thickness, for 3d texture
+    UINT_32         bpp;            ///< Bits per element (e.g. blocks for BCn, 1/3 for 96bit)
+    UINT_32         pixelPitch;     ///< Pitch in original pixels
+    UINT_32         pixelHeight;    ///< Height in original pixels
+    UINT_32         pixelBits;      ///< Original bits per pixel, passed from input
+    UINT_64         sliceSize;      ///< Size of slice specified by input's slice
+                                    ///  The result is controlled by surface flags & createFlags
+                                    ///  By default this value equals to surfSize for volume
+    UINT_32         pitchTileMax;   ///< PITCH_TILE_MAX value for h/w register
+    UINT_32         heightTileMax;  ///< HEIGHT_TILE_MAX value for h/w register
+    UINT_32         sliceTileMax;   ///< SLICE_TILE_MAX value for h/w register
+
+    UINT_32         numSamples;     ///< Pass the effective numSamples processed in this call
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< Tile parameters used. Filled in if 0 on input
+    AddrTileType    tileType;       ///< Micro tiling type, only valid when tileIndex != -1
+    INT_32          tileIndex;      ///< Tile index, MAY be "downgraded"
+
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+    /// Special information to work around SI mipmap swizzle bug UBTS #317508
+    BOOL_32         last2DLevel;    ///< TRUE if this is the last 2D(3D) tiled level.
+                                    ///  Only meaningful when create flag checkLast2DLevel is set
+    /// Stereo info
+    ADDR_QBSTEREOINFO*  pStereoInfo;///< Stereo information, needed when .qbStereo flag is TRUE
+} ADDR_COMPUTE_SURFACE_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceInfo
+*
+*   @brief
+*       Compute surface width/height/depth/alignments and suitable tiling mode from pIn into pOut
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeSurfaceAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_32         x;                  ///< X coordinate
+    UINT_32         y;                  ///< Y coordinate
+    UINT_32         slice;              ///< Slice index
+    UINT_32         sample;             ///< Sample index, use fragment index for EQAA
+
+    UINT_32         bpp;                ///< Bits per pixel
+    UINT_32         pitch;              ///< Surface pitch, in pixels
+    UINT_32         height;             ///< Surface height, in pixels
+    UINT_32         numSlices;          ///< Surface depth
+    UINT_32         numSamples;         ///< Number of samples
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    BOOL_32         isDepth;            ///< TRUE if the surface uses depth sample ordering within
+                                        ///  micro tile. Textures can also choose depth sample order
+    UINT_32         tileBase;           ///< Base offset (in bits) inside micro tile which handles
+                                        ///  the case that components are stored separately
+    UINT_32         compBits;           ///< The component bits actually needed (for planar surface)
+
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    // Used for 1D tiling above
+    AddrTileType    tileType;           ///< See definition of AddrTileType
+    struct
+    {
+        UINT_32     ignoreSE : 1;       ///< TRUE if shader engines are ignored. This is a texture
+                                        ///  only flag. Only non-RT texture can set this to TRUE
+        UINT_32     reserved :31;       ///< Reserved for future use.
+    };
+    // 2D tiling needs following structure
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Client must provide all data
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+
+#if ADDR_AM_BUILD // These two fields are not valid in SW blt since no HTILE access
+    UINT_32         addr5Swizzle;       ///< ADDR5_SWIZZLE_MASK of DB_DEPTH_INFO
+    BOOL_32         is32ByteTile;       ///< Caller must have access to HTILE buffer and know if
+                                        ///  this tile is compressed to 32B
+#endif
+} ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeSurfaceAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< Byte address
+    UINT_32 bitPosition;    ///< Bit position within addr, 0-7.
+                            ///  For surface bpp < 8, e.g. FMT_1.
+    UINT_32 prtBlockIndex;  ///< Index of a PRT tile (64K block)
+} ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceAddrFromCoord
+*
+*   @brief
+*       Compute surface address (byte address and bit position in pOut) from the coordinate in pIn.
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeSurfaceCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_64         addr;               ///< Address in bytes
+    UINT_32         bitPosition;        ///< Bit position within addr, 0-7; for surface bpp < 8,
+                                        ///  e.g. FMT_1
+    UINT_32         bpp;                ///< Bits per pixel
+    UINT_32         pitch;              ///< Pitch, in pixels
+    UINT_32         height;             ///< Height in pixels
+    UINT_32         numSlices;          ///< Surface depth
+    UINT_32         numSamples;         ///< Number of samples
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    BOOL_32         isDepth;            ///< Surface uses depth sample ordering within micro tile.
+                                        ///  Note: Textures can choose depth sample order as well.
+    UINT_32         tileBase;           ///< Base offset (in bits) inside micro tile which handles
+                                        ///  the case that components are stored separately
+    UINT_32         compBits;           ///< The component bits actually needed (for planar surface)
+
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    // Used for 1D tiling and above
+    AddrTileType    tileType;           ///< See definition of AddrTileType
+    struct
+    {
+        UINT_32     ignoreSE : 1;       ///< TRUE if shader engines are ignored. This is a texture-
+                                        ///  only flag. Only non-RT texture can set this to TRUE
+        UINT_32     reserved :31;       ///< Reserved for future use.
+    };
+    // 2D tiling needs the following structure
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Client must provide all data
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+} ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeSurfaceCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;   ///< Size of this structure in bytes
+
+    UINT_32 x;      ///< X coordinate
+    UINT_32 y;      ///< Y coordinate
+    UINT_32 slice;  ///< Slice index
+    UINT_32 sample; ///< Sample index (fragment index for EQAA)
+} ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSurfaceCoordFromAddr
+*
+*   @brief
+*       Compute the coordinate (x, y, slice, sample) from a given surface address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSurfaceCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                   HTile functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   ADDR_HTILE_FLAGS
+*
+*   @brief
+*       HTILE flags
+***************************************************************************************************
+*/
+typedef union _ADDR_HTILE_FLAGS
+{
+    struct
+    {
+        UINT_32 tcCompatible  : 1; ///< Flag indicates surface needs to be shader readable
+        UINT_32 reserved      :31; ///< Reserved bits
+    };
+
+    UINT_32 value;                 ///< All flag bits accessed as a single 32-bit value
+} ADDR_HTILE_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_INFO_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeHtileInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_INFO_INPUT
+{
+    UINT_32            size;            ///< Size of this structure in bytes
+
+    ADDR_HTILE_FLAGS   flags;           ///< HTILE flags
+    UINT_32            pitch;           ///< Surface pitch, in pixels
+    UINT_32            height;          ///< Surface height, in pixels
+    UINT_32            numSlices;       ///< Number of slices
+    BOOL_32            isLinear;        ///< Linear or tiled HTILE layout
+    AddrHtileBlockSize blockWidth;      ///< 4 or 8. EG and above only support 8
+    AddrHtileBlockSize blockHeight;     ///< 4 or 8. EG and above only support 8
+    ADDR_TILEINFO*     pTileInfo;       ///< Tile info
+
+    INT_32             tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32             macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                        ///  NOTE: must be valid whenever tileIndex is not -1
+} ADDR_COMPUTE_HTILE_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_INFO_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeHtileInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_INFO_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 pitch;          ///< Pitch in pixels of the depth buffer represented by this
+                            ///  HTile buffer. This might be larger than the original depth
+                            ///  buffer pitch when called with an unaligned pitch.
+    UINT_32 height;         ///< Height in pixels, as above
+    UINT_64 htileBytes;     ///< Size of HTILE buffer, in bytes
+    UINT_32 baseAlign;      ///< Base alignment
+    UINT_32 bpp;            ///< "Bits per pixel" for HTILE: number of bits for an 8x8 block!
+    UINT_32 macroWidth;     ///< Macro width in pixels, actually squared cache shape
+    UINT_32 macroHeight;    ///< Macro height in pixels
+    UINT_64 sliceSize;      ///< Slice size, in bytes.
+} ADDR_COMPUTE_HTILE_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileInfo
+*
+*   @brief
+*       Compute Htile pitch, height, base alignment, total size and slice size in bytes
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_HTILE_INFO_INPUT*    pIn,
+    ADDR_COMPUTE_HTILE_INFO_OUTPUT*         pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeHtileAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT
+{
+    UINT_32            size;            ///< Size of this structure in bytes
+
+    UINT_32            pitch;           ///< Pitch, in pixels
+    UINT_32            height;          ///< Height in pixels
+    UINT_32            x;               ///< X coordinate
+    UINT_32            y;               ///< Y coordinate
+    UINT_32            slice;           ///< Index of slice
+    UINT_32            numSlices;       ///< Number of slices
+    BOOL_32            isLinear;        ///< Linear or tiled HTILE layout
+    AddrHtileBlockSize blockWidth;      ///< 4 or 8 (0 means 4, 1 means 8). EG and above only support 8
+    AddrHtileBlockSize blockHeight;     ///< 4 or 8 (0 means 4, 1 means 8). EG and above only support 8
+    ADDR_TILEINFO*     pTileInfo;       ///< Tile info
+
+    INT_32             tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32             macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                        ///  NOTE: must be valid whenever tileIndex is not -1
+} ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeHtileAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< Address in bytes
+    UINT_32 bitPosition;    ///< Bit position, 0 or 4. CMASK and HTILE share some lib methods,
+                            ///  so we keep bitPosition for HTILE as well
+} ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileAddrFromCoord
+*
+*   @brief
+*       Compute the Htile address for the given coordinates (x, y, slice) of a depth buffer
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*   pIn,
+    ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT*        pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeHtileCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT
+{
+    UINT_32            size;            ///< Size of this structure in bytes
+
+    UINT_64            addr;            ///< Htile address
+    UINT_32            bitPosition;     ///< Bit position 0 or 4. CMASK and HTILE share some methods
+                                        ///  so we keep bitPosition for HTILE as well
+    UINT_32            pitch;           ///< Pitch, in pixels
+    UINT_32            height;          ///< Height, in pixels
+    UINT_32            numSlices;       ///< Number of slices
+    BOOL_32            isLinear;        ///< Linear or tiled HTILE layout
+    AddrHtileBlockSize blockWidth;      ///< 4 or 8 (0 means 4, 1 means 8). R8xx/R9xx only support 8
+    AddrHtileBlockSize blockHeight;     ///< 4 or 8 (0 means 4, 1 means 8). R8xx/R9xx only support 8
+    ADDR_TILEINFO*     pTileInfo;       ///< Tile info
+
+    INT_32             tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32             macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                        ///  NOTE: must be valid whenever tileIndex is not -1
+} ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeHtileCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;   ///< Size of this structure in bytes
+
+    UINT_32 x;      ///< X coordinate within the depth buffer
+    UINT_32 y;      ///< Y coordinate within the depth buffer
+    UINT_32 slice;  ///< Slice index
+} ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeHtileCoordFromAddr
+*
+*   @brief
+*       Compute the coordinates within the depth buffer (1st pixel of a micro tile) that
+*       correspond to a given Htile address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeHtileCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*   pIn,
+    ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT*        pOut);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     C-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   ADDR_CMASK_FLAGS
+*
+*   @brief
+*       CMASK flags
+***************************************************************************************************
+*/
+typedef union _ADDR_CMASK_FLAGS
+{
+    struct
+    {
+        UINT_32 tcCompatible  : 1; ///< Flag indicates surface needs to be shader readable
+        UINT_32 reserved      :31; ///< Reserved bits
+    };
+
+    UINT_32 value;                 ///< All flag bits accessed as a single 32-bit value
+} ADDR_CMASK_FLAGS;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_INFO_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeCmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASKINFO_INPUT
+{
+    UINT_32             size;            ///< Size of this structure in bytes
+
+    ADDR_CMASK_FLAGS    flags;           ///< CMASK flags
+    UINT_32             pitch;           ///< Pitch, in pixels, of color buffer
+    UINT_32             height;          ///< Height, in pixels, of color buffer
+    UINT_32             numSlices;       ///< Number of slices, of color buffer
+    BOOL_32             isLinear;        ///< Linear or tiled layout. Only SI can be linear
+    ADDR_TILEINFO*      pTileInfo;       ///< Tile info
+
+    INT_32              tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                         ///  while the global useTileIndex is set to 1
+    INT_32              macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                         ///  NOTE: must be valid whenever tileIndex is not -1
+} ADDR_COMPUTE_CMASK_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_INFO_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeCmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_INFO_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 pitch;          ///< Pitch in pixels of the color buffer which
+                            ///  this Cmask matches. This might be larger than the
+                            ///  original color buffer pitch when called with
+                            ///  an unaligned pitch.
+    UINT_32 height;         ///< Height in pixels, as above
+    UINT_64 cmaskBytes;     ///< Size in bytes of CMask buffer
+    UINT_32 baseAlign;      ///< Base alignment
+    UINT_32 blockMax;       ///< Cmask block size. Needed to set the CB_COLORn_MASK register
+    UINT_32 macroWidth;     ///< Macro width in pixels, actually squared cache shape
+    UINT_32 macroHeight;    ///< Macro height in pixels
+    UINT_64 sliceSize;      ///< Slice size, in bytes.
+} ADDR_COMPUTE_CMASK_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskInfo
+*
+*   @brief
+*       Compute Cmask pitch, height, base alignment and size in bytes from the color buffer
+*       info given in pIn
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_CMASK_INFO_INPUT*    pIn,
+    ADDR_COMPUTE_CMASK_INFO_OUTPUT*         pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeCmaskAddrFromCoord
+*
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT
+{
+    UINT_32          size;           ///< Size of this structure in bytes
+    UINT_32          x;              ///< X coordinate
+    UINT_32          y;              ///< Y coordinate
+    UINT_64          fmaskAddr;      ///< Fmask addr for tc compatible Cmask
+    UINT_32          slice;          ///< Slice index
+    UINT_32          pitch;          ///< Pitch in pixels, of color buffer
+    UINT_32          height;         ///< Height in pixels, of color buffer
+    UINT_32          numSlices;      ///< Number of slices
+    UINT_32          bpp;            ///< Bits per pixel
+    BOOL_32          isLinear;       ///< Linear or tiled layout. Only SI can be linear
+    ADDR_CMASK_FLAGS flags;          ///< CMASK flags
+    ADDR_TILEINFO*   pTileInfo;      ///< Tile info
+
+    INT_32           tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                     ///  while the global useTileIndex is set to 1
+    INT_32           macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                     ///  NOTE: must be valid whenever tileIndex is not -1
+} ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeCmaskAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< CMASK address in bytes
+    UINT_32 bitPosition;    ///< Bit position within addr, 0-7. CMASK is 4 bpp,
+                            ///  so the value is either 0 (low nibble) or 4 (high nibble)
+} ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Compute the Cmask address (and bit position) for given coordinates of an MSAA color buffer
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*   pIn,
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*        pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeCmaskCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT
+{
+    UINT_32        size;            ///< Size of this structure in bytes
+
+    UINT_64        addr;            ///< CMASK address in bytes
+    UINT_32        bitPosition;     ///< Bit position within addr, 0-7. CMASK is 4 bpp,
+                                    ///  so the value is either 0 (low nibble) or 4 (high nibble)
+    UINT_32        pitch;           ///< Pitch, in pixels
+    UINT_32        height;          ///< Height in pixels
+    UINT_32        numSlices;       ///< Number of slices
+    BOOL_32        isLinear;        ///< Linear or tiled layout. Only SI can be linear
+    ADDR_TILEINFO* pTileInfo;       ///< Tile info
+
+    INT_32         tileIndex;       ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32         macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                    ///  NOTE: must be valid whenever tileIndex is not -1
+} ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeCmaskCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;   ///< Size of this structure in bytes
+
+    UINT_32 x;      ///< X coordinate within the color buffer
+    UINT_32 y;      ///< Y coordinate within the color buffer
+    UINT_32 slice;  ///< Slice index
+} ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeCmaskCoordFromAddr
+*
+*   @brief
+*       Compute the coordinates within the color buffer (1st pixel of a micro tile) that
+*       correspond to a given Cmask address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeCmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*   pIn,
+    ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT*        pOut);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     F-mask functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_INFO_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeFmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_INFO_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    UINT_32         pitch;              ///< Surface pitch, in pixels
+    UINT_32         height;             ///< Surface height, in pixels
+    UINT_32         numSlices;          ///< Number of slices (depth)
+    UINT_32         numSamples;         ///< Number of samples
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+    /// r800 and later HWL parameters
+    struct
+    {
+        UINT_32 resolved:   1;          ///< TRUE if the surface is for resolved fmask, only used
+                                        ///  by H/W clients. S/W should always set it to FALSE.
+        UINT_32 reserved:  31;          ///< Reserved for future use.
+    };
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tiling parameters. Client must provide valid data
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+} ADDR_COMPUTE_FMASK_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_INFO_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeFmaskInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_INFO_OUTPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         pitch;          ///< Pitch of fmask in pixels
+    UINT_32         height;         ///< Height of fmask in pixels
+    UINT_32         numSlices;      ///< Slices of fmask
+    UINT_64         fmaskBytes;     ///< Size of fmask in bytes
+    UINT_32         baseAlign;      ///< Base address alignment
+    UINT_32         pitchAlign;     ///< Pitch alignment
+    UINT_32         heightAlign;    ///< Height alignment
+    UINT_32         bpp;            ///< Bits per pixel of FMASK, i.e. the number of bit planes
+    UINT_32         numSamples;     ///< Number of samples, used for dump; exported since the input
+                                    ///  may be changed in 9xx and above
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< Tile parameters used. Fmask can have different
+                                    ///  bank_height from color buffer
+    INT_32          tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+    UINT_64         sliceSize;      ///< Size of slice in bytes
+} ADDR_COMPUTE_FMASK_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskInfo
+*
+*   @brief
+*       Compute Fmask pitch, height, number of slices, alignments and size in bytes
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeFmaskAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_32         x;                  ///< X coordinate
+    UINT_32         y;                  ///< Y coordinate
+    UINT_32         slice;              ///< Slice index
+    UINT_32         plane;              ///< Plane number
+    UINT_32         sample;             ///< Sample index (fragment index for EQAA)
+
+    UINT_32         pitch;              ///< Surface pitch, in pixels
+    UINT_32         height;             ///< Surface height, in pixels
+    UINT_32         numSamples;         ///< Number of samples
+    UINT_32         numFrags;           ///< Number of fragments, leave it zero or the same as
+                                        ///  number of samples for normal AA; Set it to the
+                                        ///  number of fragments for EQAA
+
+    AddrTileMode    tileMode;           ///< Tile mode
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+
+    /// r800 and later HWL parameters
+    struct
+    {
+        UINT_32 resolved:   1;          ///< TRUE if this is a resolved fmask, used by H/W clients
+        UINT_32 ignoreSE:   1;          ///< TRUE if shader engines are ignored.
+        UINT_32 reserved:  30;          ///< Reserved for future use.
+    };
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tiling parameters. Client must provide all data
+
+} ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeFmaskAddrFromCoord
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_64 addr;           ///< Fmask address
+    UINT_32 bitPosition;    ///< Bit position within addr, 0-7.
+} ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Compute the Fmask address (and bit position) for coordinates (x, y, slice, sample, plane)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskAddrFromCoord(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT
+*
+*   @brief
+*       Input structure for AddrComputeFmaskCoordFromAddr
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    UINT_64         addr;               ///< Fmask address
+    UINT_32         bitPosition;        ///< Bit position within addr, 0-7.
+
+    UINT_32         pitch;              ///< Pitch, in pixels
+    UINT_32         height;             ///< Height in pixels
+    UINT_32         numSamples;         ///< Number of samples
+    UINT_32         numFrags;           ///< Number of fragments
+    AddrTileMode    tileMode;           ///< Tile mode
+    union
+    {
+        struct
+        {
+            UINT_32  bankSwizzle;       ///< Bank swizzle
+            UINT_32  pipeSwizzle;       ///< Pipe swizzle
+        };
+        UINT_32     tileSwizzle;        ///< Combined swizzle, if useCombinedSwizzle is TRUE
+    };
+
+    /// r800 and later HWL parameters
+    struct
+    {
+        UINT_32 resolved:   1;          ///< TRUE if this is a resolved fmask, used by H/W clients
+        UINT_32 ignoreSE:   1;          ///< TRUE if shader engines are ignored.
+        UINT_32 reserved:  30;          ///< Reserved for future use.
+    };
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Client must provide all data
+
+} ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputeFmaskCoordFromAddr (coordinate of the given address)
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT
+{
+    UINT_32 size;       ///< Size of this structure in bytes
+
+    UINT_32 x;          ///< X coordinate
+    UINT_32 y;          ///< Y coordinate
+    UINT_32 slice;      ///< Slice index
+    UINT_32 plane;      ///< Plane number
+    UINT_32 sample;     ///< Sample index (fragment index for EQAA)
+} ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Compute FMASK coordinate from a given address
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeFmaskCoordFromAddr(
+    ADDR_HANDLE                                     hLib,
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut);
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                          Element/utility functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrGetVersion
+*
+*   @brief
+*       Get AddrLib version number; hLib is the address library handle
+***************************************************************************************************
+*/
+UINT_32 ADDR_API AddrGetVersion(ADDR_HANDLE hLib);
+
+/**
+***************************************************************************************************
+*   AddrUseTileIndex
+*
+*   @brief
+*       Return TRUE if tileIndex is enabled in the address library given by hLib
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseTileIndex(ADDR_HANDLE hLib);
+
+/**
+***************************************************************************************************
+*   AddrUseCombinedSwizzle
+*
+*   @brief
+*       Return TRUE if combined swizzle is enabled in the address library given by hLib
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API AddrUseCombinedSwizzle(ADDR_HANDLE hLib);
+
+/**
+***************************************************************************************************
+*   ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrExtractBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         base256b;       ///< Base256b value the swizzle is extracted from
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< 2D tile parameters. Client must provide all data
+
+    INT_32          tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                    ///< README: When tileIndex is not -1, this must be valid
+} ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrExtractBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 bankSwizzle;    ///< Extracted bank swizzle
+    UINT_32 pipeSwizzle;    ///< Extracted pipe swizzle
+} ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrExtractBankPipeSwizzle
+*
+*   @brief
+*       Extract bank and pipe swizzle from a base256b value (pIn->base256b)
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrExtractBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut);
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrCombineBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    UINT_32         bankSwizzle;    ///< Bank swizzle to combine
+    UINT_32         pipeSwizzle;    ///< Pipe swizzle to combine
+    UINT_64         baseAddr;       ///< Base address, full MCAddress (leave it zero for driver clients)
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;      ///< 2D tile parameters. Client must provide all data
+
+    INT_32          tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                    ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                    ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrCombineBankPipeSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 tileSwizzle;    ///< Combined bank and pipe swizzle
+} ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrCombineBankPipeSwizzle
+*
+*   @brief
+*       Combine bank and pipe swizzle into one tile swizzle value (pOut->tileSwizzle)
+*   @return
+*       ADDR_OK if no error
+*   @note
+*       baseAddr here is full MCAddress instead of base256b
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrCombineBankPipeSwizzle(
+    ADDR_HANDLE                                 hLib,
+    const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SLICESWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeSliceSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SLICESWIZZLE_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    AddrTileMode    tileMode;           ///< Tile Mode
+    UINT_32         baseSwizzle;        ///< Base tile swizzle
+    UINT_32         slice;              ///< Index of the slice to compute the swizzle for
+    UINT_64         baseAddr;           ///< Base address, driver should leave it 0 in most cases
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;          ///< 2D tile parameters. Actually banks needed here!
+
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex;     ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_SLICESWIZZLE_INPUT;
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_SLICESWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeSliceSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_SLICESWIZZLE_OUTPUT
+{
+    UINT_32  size;           ///< Size of this structure in bytes
+
+    UINT_32  tileSwizzle;    ///< Recalculated tileSwizzle value for the slice
+} ADDR_COMPUTE_SLICESWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeSliceSwizzle
+*
+*   @brief
+*       Recalculate the tile swizzle for a slice from the base tile swizzle
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeSliceSwizzle(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*       pOut);
+
+
+/**
+***************************************************************************************************
+*   AddrSwizzleGenOption
+*
+*   @brief
+*       Which swizzle generating options: legacy or linear
+***************************************************************************************************
+*/
+typedef enum _AddrSwizzleGenOption
+{
+    ADDR_SWIZZLE_GEN_DEFAULT    = 0,    ///< As is in client driver implementation for swizzle
+    ADDR_SWIZZLE_GEN_LINEAR     = 1,    ///< Using a linear increment of swizzle
+} AddrSwizzleGenOption;
+
+/**
+***************************************************************************************************
+*   ADDR_SWIZZLE_OPTION
+*
+*   @brief
+*       Controls how swizzle is generated
+***************************************************************************************************
+*/
+typedef union _ADDR_SWIZZLE_OPTION
+{
+    struct
+    {
+        UINT_32 genOption       : 1;    ///< The way swizzle is generated, see AddrSwizzleGenOption
+        UINT_32 reduceBankBit   : 1;    ///< TRUE if we need reduce swizzle bits
+        UINT_32 reserved        :30;    ///< Reserved bits
+    };
+
+    UINT_32 value;                      ///< All option bits viewed as one 32-bit value
+
+} ADDR_SWIZZLE_OPTION;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_BASE_SWIZZLE_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeBaseSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_BASE_SWIZZLE_INPUT
+{
+    UINT_32             size;           ///< Size of this structure in bytes
+
+    ADDR_SWIZZLE_OPTION option;         ///< Swizzle option, see ADDR_SWIZZLE_OPTION
+    UINT_32             surfIndex;      ///< Index of this surface type
+    AddrTileMode        tileMode;       ///< Tile Mode
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*      pTileInfo;      ///< 2D tile parameters. Actually banks needed here!
+
+    INT_32              tileIndex;      ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32              macroModeIndex; ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_BASE_SWIZZLE_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeBaseSwizzle
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_32 tileSwizzle;    ///< Combined bank and pipe swizzle
+} ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeBaseSwizzle
+*
+*   @brief
+*       Return a combined bank and pipe swizzle based on surface type/index
+*   @return
+*       ADDR_OK if no error
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeBaseSwizzle(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ELEM_GETEXPORTNORM_INPUT
+*
+*   @brief
+*       Input structure for ElemGetExportNorm
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_GETEXPORTNORM_INPUT
+{
+    UINT_32             size;       ///< Size of this structure in bytes
+
+    AddrColorFormat     format;     ///< Color buffer format; Client should use ColorFormat
+    AddrSurfaceNumber   num;        ///< Surface number type; Client should use NumberType
+    AddrSurfaceSwap     swap;       ///< Surface byte swap; Client should use SurfaceSwap
+    UINT_32             numSamples; ///< Number of samples
+} ELEM_GETEXPORTNORM_INPUT;
+
+/**
+***************************************************************************************************
+*   ElemGetExportNorm
+*
+*   @brief
+*       Helper function to check whether a format can be EXPORT_NORM, which is a register
+*       CB_COLOR_INFO.SURFACE_FORMAT. FP16 can be reported as EXPORT_NORM for rv770 in r600
+*       family
+*   @note
+*       The implementation is only for r600.
+*       00 - EXPORT_FULL: PS exports are 4 pixels with 4 components with 32-bits-per-component. (two
+*       clocks per export)
+*       01 - EXPORT_NORM: PS exports are 4 pixels with 4 components with 16-bits-per-component. (one
+*       clock per export)
+*
+***************************************************************************************************
+*/
+BOOL_32 ADDR_API ElemGetExportNorm(
+    ADDR_HANDLE                     hLib,
+    const ELEM_GETEXPORTNORM_INPUT* pIn);
+
+
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TODEPTHPIXEL_INPUT
+*
+*   @brief
+*       Input structure for ElemFlt32ToDepthPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TODEPTHPIXEL_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    AddrDepthFormat format;         ///< Depth buffer format
+    ADDR_FLT_32     comps[2];       ///< Component values (Z/stencil)
+} ELEM_FLT32TODEPTHPIXEL_INPUT;
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TODEPTHPIXEL_OUTPUT
+*
+*   @brief
+*       Output structure for ElemFlt32ToDepthPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TODEPTHPIXEL_OUTPUT
+{
+    UINT_32 size;           ///< Size of this structure in bytes
+
+    UINT_8* pPixel;         ///< Real depth value. Same data type as depth buffer.
+                            ///  Client must provide enough storage for this type.
+    UINT_32 depthBase;      ///< Tile base in bits for depth bits
+    UINT_32 stencilBase;    ///< Tile base in bits for stencil bits
+    UINT_32 depthBits;      ///< Bits for depth
+    UINT_32 stencilBits;    ///< Bits for stencil
+} ELEM_FLT32TODEPTHPIXEL_OUTPUT;
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToDepthPixel
+*
+*   @brief
+*       Convert FLT_32 component values (Z/stencil) to a depth/stencil pixel value
+*
+*   @return
+*       Return code
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToDepthPixel(
+    ADDR_HANDLE                         hLib,
+    const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,
+    ELEM_FLT32TODEPTHPIXEL_OUTPUT*      pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TOCOLORPIXEL_INPUT
+*
+*   @brief
+*       Input structure for ElemFlt32ToColorPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TOCOLORPIXEL_INPUT
+{
+    UINT_32            size;           ///< Size of this structure in bytes
+
+    AddrColorFormat    format;         ///< Color buffer format
+    AddrSurfaceNumber  surfNum;        ///< Surface number
+    AddrSurfaceSwap    surfSwap;       ///< Surface swap
+    ADDR_FLT_32        comps[4];       ///< Component values (r/g/b/a)
+} ELEM_FLT32TOCOLORPIXEL_INPUT;
+
+/**
+***************************************************************************************************
+*   ELEM_FLT32TOCOLORPIXEL_OUTPUT
+*
+*   @brief
+*       Output structure for ElemFlt32ToColorPixel
+*
+***************************************************************************************************
+*/
+typedef struct _ELEM_FLT32TOCOLORPIXEL_OUTPUT
+{
+    UINT_32 size;       ///< Size of this structure in bytes
+
+    UINT_8* pPixel;     ///< Real color value. Same data type as color buffer.
+                        ///  Client must provide enough storage for this type.
+} ELEM_FLT32TOCOLORPIXEL_OUTPUT;
+
+/**
+***************************************************************************************************
+*   ElemFlt32ToColorPixel
+*
+*   @brief
+*       Convert FLT_32 component values (r/g/b/a) to a red/green/blue/alpha pixel value
+*
+*   @return
+*       Return code
+*
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API ElemFlt32ToColorPixel(
+    ADDR_HANDLE                         hLib,
+    const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,
+    ELEM_FLT32TOCOLORPIXEL_OUTPUT*      pOut);
+
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINFOTOHW_INPUT
+*
+*   @brief
+*       Input structure for AddrConvertTileInfoToHW
+*   @note
+*       When reverse is TRUE, indices are ignored
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINFOTOHW_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+    BOOL_32         reverse;            ///< Convert control flag.
+                                        ///  FALSE: convert from real value to HW value;
+                                        ///  TRUE: convert from HW value to real value.
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*  pTileInfo;          ///< Tile parameters with real value
+
+    INT_32          tileIndex;          ///< Tile index, MUST be -1 if you don't want to use it
+                                        ///  while the global useTileIndex is set to 1
+    INT_32          macroModeIndex;     ///< Index in macro tile mode table if there is one (CI)
+                                        ///< README: When tileIndex is not -1, this must be valid
+} ADDR_CONVERT_TILEINFOTOHW_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINFOTOHW_OUTPUT
+*
+*   @brief
+*       Output structure for AddrConvertTileInfoToHW
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINFOTOHW_OUTPUT
+{
+    UINT_32             size;               ///< Size of this structure in bytes
+
+    /// r800 and later HWL parameters
+    ADDR_TILEINFO*      pTileInfo;          ///< Converted tile parameters (hardware register value)
+
+} ADDR_CONVERT_TILEINFOTOHW_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrConvertTileInfoToHW
+*
+*   @brief
+*       Convert tile info from real value to HW register value (or back if pIn->reverse is TRUE)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileInfoToHW(
+    ADDR_HANDLE                             hLib,
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT*  pIn,
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT*       pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINDEX_INPUT
+*
+*   @brief
+*       Input structure for AddrConvertTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINDEX_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    INT_32          tileIndex;          ///< Tile index
+    INT_32          macroModeIndex;     ///< Index in macro tile mode table if there is one (CI)
+    BOOL_32         tileInfoHw;         ///< Set to TRUE if client wants HW enum, otherwise actual value
+} ADDR_CONVERT_TILEINDEX_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINDEX_OUTPUT
+*
+*   @brief
+*       Output structure for AddrConvertTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINDEX_OUTPUT
+{
+    UINT_32             size;           ///< Size of this structure in bytes
+
+    AddrTileMode        tileMode;       ///< Tile mode for the tile index
+    AddrTileType        tileType;       ///< Tile type for the tile index
+    ADDR_TILEINFO*      pTileInfo;      ///< Tile info for the tile index
+
+} ADDR_CONVERT_TILEINDEX_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex
+*
+*   @brief
+*       Convert tile index (and macro mode index) to tile mode/type/info
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex(
+    ADDR_HANDLE                         hLib,
+    const ADDR_CONVERT_TILEINDEX_INPUT* pIn,
+    ADDR_CONVERT_TILEINDEX_OUTPUT*      pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_CONVERT_TILEINDEX1_INPUT
+*
+*   @brief
+*       Input structure for AddrConvertTileIndex1 (without macro mode index)
+***************************************************************************************************
+*/
+typedef struct _ADDR_CONVERT_TILEINDEX1_INPUT
+{
+    UINT_32         size;               ///< Size of this structure in bytes
+
+    INT_32          tileIndex;          ///< Tile index
+    UINT_32         bpp;                ///< Bits per pixel
+    UINT_32         numSamples;         ///< Number of samples
+    BOOL_32         tileInfoHw;         ///< Set to TRUE if client wants HW enum, otherwise actual value
+} ADDR_CONVERT_TILEINDEX1_INPUT;
+
+/**
+***************************************************************************************************
+*   AddrConvertTileIndex1
+*
+*   @brief
+*       Convert tile index to tile mode/type/info (input without macro mode index)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrConvertTileIndex1(
+    ADDR_HANDLE                             hLib,
+    const ADDR_CONVERT_TILEINDEX1_INPUT*    pIn,
+    ADDR_CONVERT_TILEINDEX_OUTPUT*          pOut);
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_GET_TILEINDEX_INPUT
+*
+*   @brief
+*       Input structure for AddrGetTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_GET_TILEINDEX_INPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    AddrTileMode    tileMode;       ///< Tile mode to look up
+    AddrTileType    tileType;       ///< Tile-type: disp/non-disp/...
+    ADDR_TILEINFO*  pTileInfo;      ///< Pointer to tile-info structure, can be NULL for linear/1D
+} ADDR_GET_TILEINDEX_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_GET_TILEINDEX_OUTPUT
+*
+*   @brief
+*       Output structure for AddrGetTileIndex
+***************************************************************************************************
+*/
+typedef struct _ADDR_GET_TILEINDEX_OUTPUT
+{
+    UINT_32         size;           ///< Size of this structure in bytes
+
+    INT_32          index;          ///< Index in table
+} ADDR_GET_TILEINDEX_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrGetTileIndex
+*
+*   @brief
+*       Get the index in table for the given tile mode/type/info
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrGetTileIndex(
+    ADDR_HANDLE                     hLib,
+    const ADDR_GET_TILEINDEX_INPUT* pIn,
+    ADDR_GET_TILEINDEX_OUTPUT*      pOut);
+
+
+
+
+/**
+***************************************************************************************************
+*   ADDR_PRT_INFO_INPUT
+*
+*   @brief
+*       Input structure for AddrComputePrtInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_PRT_INFO_INPUT
+{
+    AddrFormat          format;        ///< Surface format
+    UINT_32             baseMipWidth;  ///< Base mipmap width
+    UINT_32             baseMipHeight; ///< Base mipmap height
+    UINT_32             baseMipDepth;  ///< Base mipmap depth
+    UINT_32             numFrags;      ///< Number of fragments
+} ADDR_PRT_INFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_PRT_INFO_OUTPUT
+*
+*   @brief
+*       Output structure for AddrComputePrtInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_PRT_INFO_OUTPUT
+{
+    UINT_32             prtTileWidth;   ///< PRT tile width
+    UINT_32             prtTileHeight;  ///< PRT tile height
+} ADDR_PRT_INFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputePrtInfo
+*
+*   @brief
+*       Compute PRT surface related information (PRT tile width and height)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputePrtInfo(
+    ADDR_HANDLE                 hLib,
+    const ADDR_PRT_INFO_INPUT*  pIn,
+    ADDR_PRT_INFO_OUTPUT*       pOut);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                                     DCC key functions
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_DCCINFO_INPUT
+*
+*   @brief
+*       Input structure of AddrComputeDccInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_DCCINFO_INPUT
+{
+    UINT_32             size;            ///< Size of this structure in bytes
+    UINT_32             bpp;             ///< BitPP of color surface
+    UINT_32             numSamples;      ///< Sample number of color surface
+    UINT_64             colorSurfSize;   ///< Size of color surface to which dcc key is bound
+    AddrTileMode        tileMode;        ///< Tile mode of color surface
+    ADDR_TILEINFO       tileInfo;        ///< Tile info of color surface
+    UINT_32             tileSwizzle;     ///< Tile swizzle
+    INT_32              tileIndex;       ///< Tile index of color surface,
+                                         ///< MUST be -1 if you don't want to use it
+                                         ///< while the global useTileIndex is set to 1
+    INT_32              macroModeIndex;  ///< Index in macro tile mode table if there is one (CI)
+                                         ///< README: When tileIndex is not -1, this must be valid
+} ADDR_COMPUTE_DCCINFO_INPUT;
+
+/**
+***************************************************************************************************
+*   ADDR_COMPUTE_DCCINFO_OUTPUT
+*
+*   @brief
+*       Output structure of AddrComputeDccInfo
+***************************************************************************************************
+*/
+typedef struct _ADDR_COMPUTE_DCCINFO_OUTPUT
+{
+    UINT_32 size;                 ///< Size of this structure in bytes
+    UINT_64 dccRamBaseAlign;      ///< Base alignment of dcc key
+    UINT_64 dccRamSize;           ///< Size of dcc key
+    UINT_64 dccFastClearSize;     ///< Size of dcc key portion that can be fast cleared
+    BOOL_32 subLvlCompressible;   ///< Whether sub resource is compressible
+} ADDR_COMPUTE_DCCINFO_OUTPUT;
+
+/**
+***************************************************************************************************
+*   AddrComputeDccInfo
+*
+*   @brief
+*       Compute DCC key size, base alignment
+*       and fast-clear size info
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE ADDR_API AddrComputeDccInfo(
+    ADDR_HANDLE                             hLib,
+    const ADDR_COMPUTE_DCCINFO_INPUT*       pIn,
+    ADDR_COMPUTE_DCCINFO_OUTPUT*            pOut);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // __ADDR_INTERFACE_H__
+
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrtypes.h b/src/gallium/winsys/amdgpu/drm/addrlib/addrtypes.h
new file mode 100644
index 00000000000..4c68ac544b8
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrtypes.h
@@ -0,0 +1,590 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrtypes.h
+* @brief Contains the basic type definitions, calling-convention macros and neutral enums
+***************************************************************************************************
+*/
+#ifndef __ADDR_TYPES_H__
+#define __ADDR_TYPES_H__
+
+#if defined(__APPLE__) || defined(TCORE_BUILD)
+// External definitions header maintained by Mac driver team (and TCORE team)
+// Helps address compilation issues & reduces code covered by NDA
+#include "addrExtDef.h"
+
+#else
+
+// Windows and/or Linux: each guard below only detects a macro of that name, not a typedef
+#if !defined(VOID)
+typedef void           VOID;
+#endif
+
+#if !defined(FLOAT)
+typedef float          FLOAT;
+#endif
+
+#if !defined(CHAR)
+typedef char           CHAR;
+#endif
+
+#if !defined(INT)
+typedef int            INT;
+#endif
+
+#include <stdarg.h> // va_list...etc need this header (only included on this non-Apple/TCORE path)
+
+#endif // defined(__APPLE__) || defined(TCORE_BUILD)
+
+/**
+***************************************************************************************************
+*   Calling conventions (each macro may be pre-defined by the client to override the default)
+***************************************************************************************************
+*/
+#ifndef ADDR_CDECL
+    #if defined(__GNUC__)
+        #define ADDR_CDECL __attribute__((cdecl))   // GNU attribute spelling
+    #else
+        #define ADDR_CDECL __cdecl                  // keyword spelling for all other compilers
+    #endif
+#endif
+
+#ifndef ADDR_STDCALL    // may be pre-defined by the client to override the default
+    #if defined(__GNUC__)
+        #if defined(__AMD64__) || !defined(__i386__)  // GCC predefines __x86_64__, not __AMD64__
+            #define ADDR_STDCALL    // empty: stdcall only exists on 32-bit x86
+        #else
+            #define ADDR_STDCALL __attribute__((stdcall))
+        #endif
+    #else
+        #define ADDR_STDCALL __stdcall  // keyword spelling for all other compilers
+    #endif
+#endif
+
+#ifndef ADDR_FASTCALL   // may be pre-defined by the client to override the default
+    #if defined(__GNUC__)
+        #define ADDR_FASTCALL __attribute__((regparm(0)))   // NOTE(review): regparm(0), not fastcall
+    #else
+        #define ADDR_FASTCALL __fastcall                    // keyword spelling for other compilers
+    #endif
+#endif
+
+#ifndef GC_CDECL        // GC_* names alias the ADDR_* conventions unless already defined
+    #define GC_CDECL  ADDR_CDECL
+#endif
+
+#ifndef GC_STDCALL      // alias of ADDR_STDCALL unless already defined
+    #define GC_STDCALL  ADDR_STDCALL
+#endif
+
+#ifndef GC_FASTCALL     // alias of ADDR_FASTCALL unless already defined
+    #define GC_FASTCALL  ADDR_FASTCALL
+#endif
+
+
+#if defined(__GNUC__)
+    #define ADDR_INLINE static inline   // inline needs to be static to link
+#else
+    // win32, win64, other platforms: use the __inline keyword spelling
+    #define ADDR_INLINE   __inline
+#endif // #if defined(__GNUC__)
+
+#define ADDR_API ADDR_FASTCALL // calling convention of exported entry points (ADDR_FASTCALL by default)
+
+/**
+***************************************************************************************************
+* Global defines used by other modules (special tile index values; each may be pre-defined)
+***************************************************************************************************
+*/
+#if !defined(TILEINDEX_INVALID)
+#define TILEINDEX_INVALID                -1     ///< Tile index is not used / not valid
+#endif
+
+#if !defined(TILEINDEX_LINEAR_GENERAL)
+#define TILEINDEX_LINEAR_GENERAL         -2     ///< presumably selects linear-general; TODO confirm
+#endif
+
+#if !defined(TILEINDEX_LINEAR_ALIGNED)
+#define TILEINDEX_LINEAR_ALIGNED          8     ///< presumably selects linear-aligned; TODO confirm
+#endif
+
+/**
+***************************************************************************************************
+* Return codes of the address library API (e.g. the result type of AddrComputeDccInfo)
+***************************************************************************************************
+*/
+typedef enum _ADDR_E_RETURNCODE
+{
+    // General Return
+    ADDR_OK    = 0,
+    ADDR_ERROR = 1,
+
+    // Specific Errors (values continue implicitly from ADDR_ERROR)
+    ADDR_OUTOFMEMORY,           ///< = 2
+    ADDR_INVALIDPARAMS,         ///< = 3
+    ADDR_NOTSUPPORTED,          ///< = 4
+    ADDR_NOTIMPLEMENTED,        ///< = 5
+    ADDR_PARAMSIZEMISMATCH,     ///< = 6
+    ADDR_INVALIDGBREGVALUES,    ///< = 7
+
+} ADDR_E_RETURNCODE;
+
+/**
+***************************************************************************************************
+* @brief
+*   Neutral enums that define tile modes for all H/W
+* @note
+*   R600/R800 tiling modes can be cast to hw enums directly, but never cast values from
+*   ADDR_TM_2D_TILED_XTHICK onward into a HW enum
+*
+***************************************************************************************************
+*/
+typedef enum _AddrTileMode
+{
+    ADDR_TM_LINEAR_GENERAL      = 0,    ///< Least restrictions, pitch: multiple of 8 if not buffer
+    ADDR_TM_LINEAR_ALIGNED      = 1,    ///< Requests pitch or slice to be multiple of 64 pixels
+    ADDR_TM_1D_TILED_THIN1      = 2,    ///< Linear array of 8x8 tiles
+    ADDR_TM_1D_TILED_THICK      = 3,    ///< Linear array of 8x8x4 tiles
+    ADDR_TM_2D_TILED_THIN1      = 4,    ///< A set of macro tiles consist of 8x8 tiles
+    ADDR_TM_2D_TILED_THIN2      = 5,    ///< 600 HWL only, macro tile ratio is 1:4
+    ADDR_TM_2D_TILED_THIN4      = 6,    ///< 600 HWL only, macro tile ratio is 1:16
+    ADDR_TM_2D_TILED_THICK      = 7,    ///< A set of macro tiles consist of 8x8x4 tiles
+    ADDR_TM_2B_TILED_THIN1      = 8,    ///< 600 HWL only, with bank swap
+    ADDR_TM_2B_TILED_THIN2      = 9,    ///< 600 HWL only, with bank swap and ratio is 1:4
+    ADDR_TM_2B_TILED_THIN4      = 10,   ///< 600 HWL only, with bank swap and ratio is 1:16
+    ADDR_TM_2B_TILED_THICK      = 11,   ///< 600 HWL only, with bank swap, consists of 8x8x4 tiles
+    ADDR_TM_3D_TILED_THIN1      = 12,   ///< Macro tiling w/ pipe rotation between slices
+    ADDR_TM_3D_TILED_THICK      = 13,   ///< Macro tiling w/ pipe rotation between slices, thick
+    ADDR_TM_3B_TILED_THIN1      = 14,   ///< 600 HWL only, with bank swap
+    ADDR_TM_3B_TILED_THICK      = 15,   ///< 600 HWL only, with bank swap, thick
+    ADDR_TM_2D_TILED_XTHICK     = 16,   ///< Tile is 8x8x8, valid from NI
+    ADDR_TM_3D_TILED_XTHICK     = 17,   ///< Tile is 8x8x8, valid from NI
+    ADDR_TM_POWER_SAVE          = 18,   ///< Power save mode, only used by KMD on NI
+    ADDR_TM_PRT_TILED_THIN1     = 19,   ///< No bank/pipe rotation or hashing beyond macrotile size
+    ADDR_TM_PRT_2D_TILED_THIN1  = 20,   ///< Same as 2D_TILED_THIN1, PRT only
+    ADDR_TM_PRT_3D_TILED_THIN1  = 21,   ///< Same as 3D_TILED_THIN1, PRT only
+    ADDR_TM_PRT_TILED_THICK     = 22,   ///< No bank/pipe rotation or hashing beyond macrotile size
+    ADDR_TM_PRT_2D_TILED_THICK  = 23,   ///< Same as 2D_TILED_THICK, PRT only
+    ADDR_TM_PRT_3D_TILED_THICK  = 24,   ///< Same as 3D_TILED_THICK, PRT only
+    ADDR_TM_COUNT               = 25,   ///< Number of tile modes: value of the last tile mode + 1
+} AddrTileMode;
+
+/**
+***************************************************************************************************
+*   AddrFormat
+*
+*   @brief
+*       Neutral enum for SurfaceFormat
+*       Values are contiguous from 0x00 (ADDR_FMT_INVALID) to 0x3f (ADDR_FMT_RESERVED_63).
+***************************************************************************************************
+*/
+typedef enum _AddrFormat {
+    ADDR_FMT_INVALID                              = 0x00000000,
+    ADDR_FMT_8                                    = 0x00000001,
+    ADDR_FMT_4_4                                  = 0x00000002,
+    ADDR_FMT_3_3_2                                = 0x00000003,
+    ADDR_FMT_RESERVED_4                           = 0x00000004, ///< Reserved; suffix = decimal value
+    ADDR_FMT_16                                   = 0x00000005,
+    ADDR_FMT_16_FLOAT                             = 0x00000006,
+    ADDR_FMT_8_8                                  = 0x00000007,
+    ADDR_FMT_5_6_5                                = 0x00000008,
+    ADDR_FMT_6_5_5                                = 0x00000009,
+    ADDR_FMT_1_5_5_5                              = 0x0000000a,
+    ADDR_FMT_4_4_4_4                              = 0x0000000b,
+    ADDR_FMT_5_5_5_1                              = 0x0000000c,
+    ADDR_FMT_32                                   = 0x0000000d,
+    ADDR_FMT_32_FLOAT                             = 0x0000000e,
+    ADDR_FMT_16_16                                = 0x0000000f,
+    ADDR_FMT_16_16_FLOAT                          = 0x00000010,
+    ADDR_FMT_8_24                                 = 0x00000011,
+    ADDR_FMT_8_24_FLOAT                           = 0x00000012,
+    ADDR_FMT_24_8                                 = 0x00000013,
+    ADDR_FMT_24_8_FLOAT                           = 0x00000014,
+    ADDR_FMT_10_11_11                             = 0x00000015,
+    ADDR_FMT_10_11_11_FLOAT                       = 0x00000016,
+    ADDR_FMT_11_11_10                             = 0x00000017,
+    ADDR_FMT_11_11_10_FLOAT                       = 0x00000018,
+    ADDR_FMT_2_10_10_10                           = 0x00000019,
+    ADDR_FMT_8_8_8_8                              = 0x0000001a,
+    ADDR_FMT_10_10_10_2                           = 0x0000001b,
+    ADDR_FMT_X24_8_32_FLOAT                       = 0x0000001c,
+    ADDR_FMT_32_32                                = 0x0000001d,
+    ADDR_FMT_32_32_FLOAT                          = 0x0000001e,
+    ADDR_FMT_16_16_16_16                          = 0x0000001f,
+    ADDR_FMT_16_16_16_16_FLOAT                    = 0x00000020,
+    ADDR_FMT_RESERVED_33                          = 0x00000021, ///< Reserved; suffix = decimal value
+    ADDR_FMT_32_32_32_32                          = 0x00000022,
+    ADDR_FMT_32_32_32_32_FLOAT                    = 0x00000023,
+    ADDR_FMT_RESERVED_36                          = 0x00000024, ///< Reserved; suffix = decimal value
+    ADDR_FMT_1                                    = 0x00000025,
+    ADDR_FMT_1_REVERSED                           = 0x00000026,
+    ADDR_FMT_GB_GR                                = 0x00000027,
+    ADDR_FMT_BG_RG                                = 0x00000028,
+    ADDR_FMT_32_AS_8                              = 0x00000029,
+    ADDR_FMT_32_AS_8_8                            = 0x0000002a,
+    ADDR_FMT_5_9_9_9_SHAREDEXP                    = 0x0000002b,
+    ADDR_FMT_8_8_8                                = 0x0000002c,
+    ADDR_FMT_16_16_16                             = 0x0000002d,
+    ADDR_FMT_16_16_16_FLOAT                       = 0x0000002e,
+    ADDR_FMT_32_32_32                             = 0x0000002f,
+    ADDR_FMT_32_32_32_FLOAT                       = 0x00000030,
+    ADDR_FMT_BC1                                  = 0x00000031,
+    ADDR_FMT_BC2                                  = 0x00000032,
+    ADDR_FMT_BC3                                  = 0x00000033,
+    ADDR_FMT_BC4                                  = 0x00000034,
+    ADDR_FMT_BC5                                  = 0x00000035,
+    ADDR_FMT_BC6                                  = 0x00000036,
+    ADDR_FMT_BC7                                  = 0x00000037,
+    ADDR_FMT_32_AS_32_32_32_32                    = 0x00000038,
+    ADDR_FMT_APC3                                 = 0x00000039,
+    ADDR_FMT_APC4                                 = 0x0000003a,
+    ADDR_FMT_APC5                                 = 0x0000003b,
+    ADDR_FMT_APC6                                 = 0x0000003c,
+    ADDR_FMT_APC7                                 = 0x0000003d,
+    ADDR_FMT_CTX1                                 = 0x0000003e,
+    ADDR_FMT_RESERVED_63                          = 0x0000003f, ///< Reserved; suffix = decimal value
+} AddrFormat;
+
+/**
+***************************************************************************************************
+*   AddrDepthFormat
+*
+*   @brief
+*       Neutral enum for addrFlt32ToDepthPixel
+*       Values are contiguous from 0x0 (ADDR_DEPTH_INVALID) to 0x7.
+***************************************************************************************************
+*/
+typedef enum _AddrDepthFormat
+{
+    ADDR_DEPTH_INVALID                            = 0x00000000,
+    ADDR_DEPTH_16                                 = 0x00000001,
+    ADDR_DEPTH_X8_24                              = 0x00000002,
+    ADDR_DEPTH_8_24                               = 0x00000003,
+    ADDR_DEPTH_X8_24_FLOAT                        = 0x00000004,
+    ADDR_DEPTH_8_24_FLOAT                         = 0x00000005,
+    ADDR_DEPTH_32_FLOAT                           = 0x00000006,
+    ADDR_DEPTH_X24_8_32_FLOAT                     = 0x00000007,
+
+} AddrDepthFormat;
+
+/**
+***************************************************************************************************
+*   AddrColorFormat
+*
+*   @brief
+*       Neutral enum for ColorFormat
+*       Values 0x00-0x23 are numerically equal to the same-suffixed ADDR_FMT_* enumerators.
+***************************************************************************************************
+*/
+typedef enum _AddrColorFormat
+{
+    ADDR_COLOR_INVALID                            = 0x00000000,
+    ADDR_COLOR_8                                  = 0x00000001,
+    ADDR_COLOR_4_4                                = 0x00000002,
+    ADDR_COLOR_3_3_2                              = 0x00000003,
+    ADDR_COLOR_RESERVED_4                         = 0x00000004, ///< Reserved; suffix = decimal value
+    ADDR_COLOR_16                                 = 0x00000005,
+    ADDR_COLOR_16_FLOAT                           = 0x00000006,
+    ADDR_COLOR_8_8                                = 0x00000007,
+    ADDR_COLOR_5_6_5                              = 0x00000008,
+    ADDR_COLOR_6_5_5                              = 0x00000009,
+    ADDR_COLOR_1_5_5_5                            = 0x0000000a,
+    ADDR_COLOR_4_4_4_4                            = 0x0000000b,
+    ADDR_COLOR_5_5_5_1                            = 0x0000000c,
+    ADDR_COLOR_32                                 = 0x0000000d,
+    ADDR_COLOR_32_FLOAT                           = 0x0000000e,
+    ADDR_COLOR_16_16                              = 0x0000000f,
+    ADDR_COLOR_16_16_FLOAT                        = 0x00000010,
+    ADDR_COLOR_8_24                               = 0x00000011,
+    ADDR_COLOR_8_24_FLOAT                         = 0x00000012,
+    ADDR_COLOR_24_8                               = 0x00000013,
+    ADDR_COLOR_24_8_FLOAT                         = 0x00000014,
+    ADDR_COLOR_10_11_11                           = 0x00000015,
+    ADDR_COLOR_10_11_11_FLOAT                     = 0x00000016,
+    ADDR_COLOR_11_11_10                           = 0x00000017,
+    ADDR_COLOR_11_11_10_FLOAT                     = 0x00000018,
+    ADDR_COLOR_2_10_10_10                         = 0x00000019,
+    ADDR_COLOR_8_8_8_8                            = 0x0000001a,
+    ADDR_COLOR_10_10_10_2                         = 0x0000001b,
+    ADDR_COLOR_X24_8_32_FLOAT                     = 0x0000001c,
+    ADDR_COLOR_32_32                              = 0x0000001d,
+    ADDR_COLOR_32_32_FLOAT                        = 0x0000001e,
+    ADDR_COLOR_16_16_16_16                        = 0x0000001f,
+    ADDR_COLOR_16_16_16_16_FLOAT                  = 0x00000020,
+    ADDR_COLOR_RESERVED_33                        = 0x00000021, ///< Reserved; suffix = decimal value
+    ADDR_COLOR_32_32_32_32                        = 0x00000022,
+    ADDR_COLOR_32_32_32_32_FLOAT                  = 0x00000023,
+} AddrColorFormat;
+
+/**
+***************************************************************************************************
+*   AddrSurfaceNumber
+*
+*   @brief
+*       Neutral enum for SurfaceNumber
+*       Values are contiguous from 0x0 (ADDR_NUMBER_UNORM) to 0x7 (ADDR_NUMBER_FLOAT).
+***************************************************************************************************
+*/
+typedef enum _AddrSurfaceNumber {
+    ADDR_NUMBER_UNORM                             = 0x00000000,
+    ADDR_NUMBER_SNORM                             = 0x00000001,
+    ADDR_NUMBER_USCALED                           = 0x00000002,
+    ADDR_NUMBER_SSCALED                           = 0x00000003,
+    ADDR_NUMBER_UINT                              = 0x00000004,
+    ADDR_NUMBER_SINT                              = 0x00000005,
+    ADDR_NUMBER_SRGB                              = 0x00000006,
+    ADDR_NUMBER_FLOAT                             = 0x00000007,
+} AddrSurfaceNumber;
+
+/**
+***************************************************************************************************
+*   AddrSurfaceSwap
+*
+*   @brief
+*       Neutral enum for SurfaceSwap
+*       Four values, 0x0-0x3: STD/ALT, each with a _REV variant at value + 2.
+***************************************************************************************************
+*/
+typedef enum _AddrSurfaceSwap {
+    ADDR_SWAP_STD                                 = 0x00000000,
+    ADDR_SWAP_ALT                                 = 0x00000001,
+    ADDR_SWAP_STD_REV                             = 0x00000002,
+    ADDR_SWAP_ALT_REV                             = 0x00000003,
+} AddrSurfaceSwap;
+
+/**
+***************************************************************************************************
+*   AddrHtileBlockSize
+*
+*   @brief
+*       Size of HTILE blocks, valid values are 4 or 8 for now (enumerator value == block size)
+***************************************************************************************************
+*/
+typedef enum _AddrHtileBlockSize
+{
+    ADDR_HTILE_BLOCKSIZE_4 = 4,
+    ADDR_HTILE_BLOCKSIZE_8 = 8,
+} AddrHtileBlockSize;
+
+
+/**
+***************************************************************************************************
+*   AddrPipeCfg
+*
+*   @brief
+*       The pipe configuration field specifies both the number of pipes and
+*       how pipes are interleaved on the surface.
+*       The expression of number of pipes, the shader engine tile size, and packer tile size
+*       is encoded in a PIPE_CONFIG register field.
+*       In general the number of pipes usually matches the number of memory channels of the
+*       hardware configuration.
+*       For hw configurations w/ a non-pow2 number of memory channels, it usually matches
+*       the number of ROP units(? TODO: which registers??)
+*       The enum value = hw enum + 1 which is to reserve 0 for requesting default.
+***************************************************************************************************
+*/
+typedef enum _AddrPipeCfg
+{
+    ADDR_PIPECFG_INVALID         = 0,
+    ADDR_PIPECFG_P2              = 1, ///< 2 pipes
+    ADDR_PIPECFG_P4_8x16         = 5, ///< 4 pipes (values 2-4 are not assigned)
+    ADDR_PIPECFG_P4_16x16        = 6,
+    ADDR_PIPECFG_P4_16x32        = 7,
+    ADDR_PIPECFG_P4_32x32        = 8,
+    ADDR_PIPECFG_P8_16x16_8x16   = 9, ///< 8 pipes
+    ADDR_PIPECFG_P8_16x32_8x16   = 10,
+    ADDR_PIPECFG_P8_32x32_8x16   = 11,
+    ADDR_PIPECFG_P8_16x32_16x16  = 12,
+    ADDR_PIPECFG_P8_32x32_16x16  = 13,
+    ADDR_PIPECFG_P8_32x32_16x32  = 14,
+    ADDR_PIPECFG_P8_32x64_32x32  = 15,
+    ADDR_PIPECFG_P16_32x32_8x16  = 17, ///< 16 pipes (value 16 is not assigned)
+    ADDR_PIPECFG_P16_32x32_16x16 = 18,
+    ADDR_PIPECFG_MAX             = 19, ///< One past the largest assigned value
+} AddrPipeCfg;
+
+/**
+***************************************************************************************************
+* AddrTileType
+*
+*   @brief
+*       Neutral enum that specifies the micro tile type (MICRO_TILE_MODE)
+***************************************************************************************************
+*/
+typedef enum _AddrTileType
+{
+    ADDR_DISPLAYABLE        = 0,    ///< Displayable tiling
+    ADDR_NON_DISPLAYABLE    = 1,    ///< Non-displayable tiling, a.k.a thin micro tiling
+    ADDR_DEPTH_SAMPLE_ORDER = 2,    ///< Same as non-displayable plus depth-sample-order
+    ADDR_ROTATED            = 3,    ///< Rotated displayable tiling
+    ADDR_THICK              = 4,    ///< Thick micro-tiling, only valid for THICK and XTHICK
+} AddrTileType;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Type definitions: short system-independent names for address library types
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__APPLE__)
+
+#ifndef BOOL_32        // no bool type in C; each macro below can be pre-defined by the client
+/// @brief Boolean type, since none is defined in C
+/// @ingroup type
+#define BOOL_32 int
+#endif
+
+#ifndef INT_32
+#define INT_32  int             // assumes a 32-bit int
+#endif
+
+#ifndef UINT_32
+#define UINT_32 unsigned int    // assumes a 32-bit int
+#endif
+
+#ifndef INT_16
+#define INT_16  short           // assumes a 16-bit short
+#endif
+
+#ifndef UINT_16
+#define UINT_16 unsigned short  // assumes a 16-bit short
+#endif
+
+#ifndef INT_8
+#define INT_8   char            // NOTE(review): signedness of plain char is implementation-defined
+#endif
+
+#ifndef UINT_8
+#define UINT_8  unsigned char
+#endif
+
+#ifndef NULL
+#define NULL 0                  // only used when no earlier header has defined NULL
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+//  64-bit integer types depend on the compiler. Note that, unlike the narrower type macros,
+//  INT_64 and UINT_64 are not wrapped in #ifndef guards.
+//
+#if defined( __GNUC__ ) || defined( __WATCOMC__ )
+#define INT_64   long long
+#define UINT_64  unsigned long long
+
+#elif defined( _WIN32 )
+#define INT_64   __int64
+#define UINT_64  unsigned __int64
+
+#else
+#error Unsupported compiler and/or operating system for 64-bit integers
+
+/// @brief 64-bit signed integer type (compiler dependent)
+/// @ingroup type
+///
+/// The addrlib defines a 64-bit signed integer type for either
+/// Gnu/Watcom compilers (which use the first syntax) or for
+/// the Windows VCC compiler (which uses the second syntax).
+#define INT_64  long long OR __int64
+
+/// @brief 64-bit unsigned integer type (compiler dependent)
+/// @ingroup type
+///
+/// The addrlib defines a 64-bit unsigned integer type for either
+/// Gnu/Watcom compilers (which use the first syntax) or for
+/// the Windows VCC compiler (which uses the second syntax).
+/// (Both "OR" definitions are documentation only: the #error above fails any such build.)
+#define UINT_64  unsigned long long OR unsigned __int64
+#endif
+
+#endif // #if !defined(__APPLE__)
+
+//  ADDR64X is used to print addresses in hex form on both Windows and Linux
+//  ADDR64D is the matching conversion for printing them as signed decimal
+#if defined( __GNUC__ ) || defined( __WATCOMC__ )
+#define ADDR64X "llx"
+#define ADDR64D "lld"
+
+#elif defined( _WIN32 )
+#define ADDR64X "I64x"
+#define ADDR64D "I64d"
+
+#else
+#error Unsupported compiler and/or operating system for 64-bit integers
+
+/// @brief Addrlib device address 64-bit printf tag  (compiler dependent)
+/// @ingroup type
+///
+/// This allows printf to display an ADDR_64 for either the Windows VCC compiler
+/// (which uses "I64x") or the Gnu/Watcom compilers (which use "llx").
+/// An example of use is printf("addr 0x%" ADDR64X "\n", address);
+/// (keep the spaces around the macro: C++11 parses "..."ADDR64X as a literal suffix)
+#define ADDR64X "llx" OR "I64x"
+#define ADDR64D "lld" OR "I64d"
+#endif
+
+
+/// @brief Union for storing a 32-bit float or 32-bit integer
+/// @ingroup type
+///
+/// This union provides a simple way to convert between a 32-bit float
+/// and a 32-bit integer. It also prevents the compiler from producing
+/// code that alters NaN values when assigning or copying floats.
+/// Therefore, all address library routines that pass or return 32-bit
+/// floating point data do so by passing or returning an ADDR_FLT_32.
+///
+typedef union {
+    INT_32   i;     ///< Value viewed as a signed integer
+    UINT_32  u;     ///< Value viewed as an unsigned integer
+    float    f;     ///< Value viewed as a float
+} ADDR_FLT_32;
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Fallback va_copy for toolchains without one (forced for every _MSC_VER build).
+//  It expands to memcpy, and this header does not include <string.h> itself.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if defined(_MSC_VER)
+#if defined(va_copy)
+#undef va_copy  //redefine va_copy to support VC2013
+#endif
+#endif
+
+#if !defined(va_copy)
+#define va_copy(dst, src) \
+    ((void) memcpy(&(dst), &(src), sizeof(va_list)))
+#endif
+
+#endif // __ADDR_TYPES_H__
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrcommon.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrcommon.h
new file mode 100644
index 00000000000..f996c9a3402
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrcommon.h
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrcommon.h
+* @brief Contains the helper function and constants
+***************************************************************************************************
+*/
+
+#ifndef __ADDR_COMMON_H__
+#define __ADDR_COMMON_H__
+
+#include "addrinterface.h"
+
+
+// ADDR_LNX_KERNEL_BUILD is for internal build
+// Moved from addrinterface.h so __KERNEL__ is not needed any more
+#if ADDR_LNX_KERNEL_BUILD // || (defined(__GNUC__) && defined(__KERNEL__))
+    #include "lnx_common_defs.h" // ported from cmmqs
+#elif !defined(__APPLE__)
+    #include <stdlib.h>
+    #include <string.h>
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Common constants
+///////////////////////////////////////////////////////////////////////////////////////////////////
+static const UINT_32 MicroTileWidth      = 8;       ///< Micro tile width, for 1D and 2D tiling
+static const UINT_32 MicroTileHeight     = 8;       ///< Micro tile height, for 1D and 2D tiling
+static const UINT_32 ThickTileThickness  = 4;       ///< Micro tile thickness, for THICK modes
+static const UINT_32 XThickTileThickness = 8;       ///< Extra thick tiling thickness
+static const UINT_32 PowerSaveTileBytes  = 64;      ///< Number of bytes per tile for power save 64
+static const UINT_32 CmaskCacheBits      = 1024;    ///< Number of bits for CMASK cache
+static const UINT_32 CmaskElemBits       = 4;       ///< Number of bits for CMASK element
+static const UINT_32 HtileCacheBits      = 16384;   ///< Number of bits for HTILE cache 512*32
+
+static const UINT_32 MicroTilePixels     = MicroTileWidth * MicroTileHeight; ///< 8 * 8 = 64
+
+static const INT_32 TileIndexInvalid        = TILEINDEX_INVALID;        ///< = -1 by default
+static const INT_32 TileIndexLinearGeneral  = TILEINDEX_LINEAR_GENERAL; ///< = -2 by default
+static const INT_32 TileIndexNoMacroIndex   = -3; ///< presumably "no macro mode index"; TODO confirm
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Common macros
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#define BITS_PER_BYTE 8
+#define BITS_TO_BYTES(x) ( ((x) + (BITS_PER_BYTE-1)) / BITS_PER_BYTE )  // rounds up to whole bytes
+#define BYTES_TO_BITS(x) ( (x) * BITS_PER_BYTE )
+
+/// Helper macro to select bit b of integer v, yielding 0 or 1 (undefined later in section)
+#define _BIT(v,b)      (((v) >> (b) ) & 1)
+
+/**
+***************************************************************************************************
+* @brief Enums to identify AddrLib type (values are explicit and not contiguous)
+***************************************************************************************************
+*/
+enum AddrLibClass
+{
+    BASE_ADDRLIB = 0x0,
+    R600_ADDRLIB = 0x6,
+    R800_ADDRLIB = 0x8,
+    SI_ADDRLIB   = 0xa,
+    CI_ADDRLIB   = 0xb,
+};
+
+/**
+***************************************************************************************************
+* AddrChipFamily
+*
+*   @brief
+*       Neutral enums that specifies chip family.
+*       Values are implicit and sequential, starting at 0 for ADDR_CHIP_FAMILY_IVLD.
+***************************************************************************************************
+*/
+enum AddrChipFamily
+{
+    ADDR_CHIP_FAMILY_IVLD,    ///< Invalid family (= 0)
+    ADDR_CHIP_FAMILY_R6XX,    ///< = 1
+    ADDR_CHIP_FAMILY_R7XX,    ///< = 2
+    ADDR_CHIP_FAMILY_R8XX,    ///< = 3
+    ADDR_CHIP_FAMILY_NI,      ///< = 4
+    ADDR_CHIP_FAMILY_SI,      ///< = 5
+    ADDR_CHIP_FAMILY_CI,      ///< = 6
+    ADDR_CHIP_FAMILY_VI,      ///< = 7
+};
+
+/**
+***************************************************************************************************
+* ADDR_CONFIG_FLAGS
+*
+*   @brief
+*       Addr configuration flags: 10 one-bit flags + 22 reserved bits sharing storage with value.
+***************************************************************************************************
+*/
+union ADDR_CONFIG_FLAGS
+{
+    struct
+    {
+        /// Clients do not need to set these flags except forceLinearAligned (not declared below).
+        /// These flags are set up by AddrLib inside thru AddrInitGlobalParamsFromRegister
+        UINT_32 optimalBankSwap        : 1;    ///< New bank tiling for RV770 only
+        UINT_32 noCubeMipSlicesPad     : 1;    ///< Disables faces padding for cubemap mipmaps
+        UINT_32 fillSizeFields         : 1;    ///< If clients fill size fields in all input and
+                                               ///  output structure
+        UINT_32 ignoreTileInfo         : 1;    ///< Don't use tile info structure
+        UINT_32 useTileIndex           : 1;    ///< Make tileIndex field in input valid
+        UINT_32 useCombinedSwizzle     : 1;    ///< Use combined swizzle
+        UINT_32 checkLast2DLevel       : 1;    ///< Check the last 2D mip sub level
+        UINT_32 useHtileSliceAlign     : 1;    ///< Do htile single slice alignment
+        UINT_32 degradeBaseLevel       : 1;    ///< Degrade to 1D modes automatically for base level
+        UINT_32 allowLargeThickTile    : 1;    ///< Allow 64*thickness*bytesPerPixel > rowSize
+        UINT_32 reserved               : 22;   ///< Reserved bits for future use
+    };
+
+    UINT_32 value;  ///< All flag bits as one word
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Platform specific debug break defines (a no-op unless DEBUG is non-zero)
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if DEBUG
+    #if defined(__GNUC__)
+        #define ADDR_DBG_BREAK()    // no-op for GNU-compatible compilers, even in DEBUG builds
+    #elif defined(__APPLE__)
+        #define ADDR_DBG_BREAK()    { IOPanic("");}
+    #else
+        #define ADDR_DBG_BREAK()    { __debugbreak(); }
+    #endif
+#else
+    #define ADDR_DBG_BREAK()        // release build: expands to nothing
+#endif
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Debug assertions used in AddrLib (all expand to nothing unless DEBUG is non-zero)
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if DEBUG
+#define ADDR_ASSERT(__e) if ( !((__e) ? TRUE : FALSE)) { ADDR_DBG_BREAK(); } // bare 'if': an 'else' after it binds here
+#define ADDR_ASSERT_ALWAYS() ADDR_DBG_BREAK()
+#define ADDR_UNHANDLED_CASE() ADDR_ASSERT(!"Unhandled case")
+#define ADDR_NOT_IMPLEMENTED() ADDR_ASSERT(!"Not implemented"); // note: carries its own trailing ';'
+#else //DEBUG
+#define ADDR_ASSERT(__e)
+#define ADDR_ASSERT_ALWAYS()
+#define ADDR_UNHANDLED_CASE()
+#define ADDR_NOT_IMPLEMENTED()
+#endif //DEBUG
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Debug print macro from legacy address library
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#if DEBUG
+
+#define ADDR_PRNT(a)    AddrObject::DebugPrint a
+
+/// @brief Macro for reporting informational messages
+/// @ingroup util
+///
+/// This macro optionally prints an informational message to stdout.
+/// The first parameter is a condition -- if it is true, nothing is done.
+/// The second parameter MUST be a parenthesis-enclosed list of arguments,
+/// starting with a string. This is passed to printf() or an equivalent
+/// in order to format the informational message. For example,
+/// ADDR_INFO(0, ("test %d",3) ); prints out "test 3".
+///
+#define ADDR_INFO(cond, a)         \
+{ if (!(cond)) { ADDR_PRNT(a); } }
+
+
+/// @brief Macro for reporting error warning messages
+/// @ingroup util
+///
+/// This macro optionally prints an error warning message to stdout,
+/// followed by the file name and line number where the macro was called.
+/// The first parameter is a condition -- if it is true, nothing is done.
+/// The second pararmeter MUST be a parenthesis-enclosed list of arguments,
+/// starting with a string. This is passed to printf() or an equivalent
+/// in order to format the informational message. For example,
+/// ADDR_WARN(0, ("test %d",3) ); prints out "test 3" followed by
+/// a second line with the file name and line number.
+///
+#define ADDR_WARN(cond, a)         \
+{ if (!(cond))                     \
+  { ADDR_PRNT(a);                  \
+    ADDR_PRNT(("  WARNING in file %s, line %d\n", __FILE__, __LINE__)); \
+} }
+
+
+/// @brief Macro for reporting fatal error conditions
+/// @ingroup util
+///
+/// This macro optionally stops execution of the current routine
+/// after printing an error warning message to stdout,
+/// followed by the file name and line number where the macro was called.
+/// The first parameter is a condition -- if it is true, nothing is done.
+/// The second pararmeter MUST be a parenthesis-enclosed list of arguments,
+/// starting with a string. This is passed to printf() or an equivalent
+/// in order to format the informational message. For example,
+/// ADDR_EXIT(0, ("test %d",3) ); prints out "test 3" followed by
+/// a second line with the file name and line number, then stops execution.
+///
+#define ADDR_EXIT(cond, a)         \
+{ if (!(cond))                     \
+  { ADDR_PRNT(a); ADDR_DBG_BREAK();\
+} }
+
+#else // DEBUG
+
+#define ADDRDPF 1 ? (void)0 : (void)
+
+#define ADDR_PRNT(a)
+
+#define ADDR_DBG_BREAK()
+
+#define ADDR_INFO(cond, a)
+
+#define ADDR_WARN(cond, a)
+
+#define ADDR_EXIT(cond, a)
+
+#endif // DEBUG
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Misc helper functions
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   XorReduce
+*
+*   @brief
+*       Xor together the low numberOfBits bits of x (bit 0 is always included).
+***************************************************************************************************
+*/
+static inline UINT_32 XorReduce(
+    UINT_32 x,
+    UINT_32 numberOfBits)
+{
+    // Bit 0 seeds the running parity, even when numberOfBits is 0.
+    UINT_32 parity = x & 1;
+
+    for (UINT_32 bit = 1; bit < numberOfBits; ++bit)
+    {
+        parity ^= (x >> bit) & 1;
+    }
+
+    return parity;
+}
+
+/**
+***************************************************************************************************
+*   IsPow2
+*
+*   @brief
+*       Tell whether a UINT_32 value is a power of two (returns 1 or 0)
+***************************************************************************************************
+*/
+static inline UINT_32 IsPow2(
+    UINT_32 dim)        ///< [in] value to test; asserted to be non-zero
+{
+    ADDR_ASSERT(dim != 0);
+    return ((dim & (dim - 1)) == 0) ? 1 : 0;
+}
+
+/**
+***************************************************************************************************
+*   IsPow2
+*
+*   @brief
+*       Tell whether a UINT_64 value is a power of two (returns 1 or 0)
+***************************************************************************************************
+*/
+static inline UINT_64 IsPow2(
+    UINT_64 dim)        ///< [in] value to test; asserted to be non-zero
+{
+    ADDR_ASSERT(dim != 0);
+    return ((dim & (dim - 1)) == 0) ? 1 : 0;
+}
+
+/**
+***************************************************************************************************
+*   PowTwoAlign
+*
+*   @brief
+*       Round UINT_32 "x" up to a multiple of "align"; "align" should be a power of 2
+***************************************************************************************************
+*/
+static inline UINT_32 PowTwoAlign(
+    UINT_32 x,
+    UINT_32 align)
+{
+    // The mask arithmetic below is only meaningful for power-of-two alignments.
+    ADDR_ASSERT(IsPow2(align));
+
+    const UINT_32 lowBits = align - 1;
+    return (x + lowBits) & ~lowBits;
+}
+
+/**
+***************************************************************************************************
+*   PowTwoAlign
+*
+*   @brief
+*       Round UINT_64 "x" up to a multiple of "align"; "align" should be a power of 2
+***************************************************************************************************
+*/
+static inline UINT_64 PowTwoAlign(
+    UINT_64 x,
+    UINT_64 align)
+{
+    // The mask arithmetic below is only meaningful for power-of-two alignments.
+    ADDR_ASSERT(IsPow2(align));
+
+    const UINT_64 lowBits = align - 1;
+    return (x + lowBits) & ~lowBits;
+}
+
+/**
+***************************************************************************************************
+*   Min
+*
+*   @brief
+*       Return the smaller of two unsigned 32-bit values
+***************************************************************************************************
+*/
+static inline UINT_32 Min(
+    UINT_32 value1,
+    UINT_32 value2)
+{
+    return (value2 <= value1) ? value2 : value1;
+}
+
+/**
+***************************************************************************************************
+*   Min
+*
+*   @brief
+*       Return the smaller of two signed 32-bit values
+***************************************************************************************************
+*/
+static inline INT_32 Min(
+    INT_32 value1,
+    INT_32 value2)
+{
+    return (value2 <= value1) ? value2 : value1;
+}
+
+/**
+***************************************************************************************************
+*   Max
+*
+*   @brief
+*       Return the larger of two unsigned 32-bit values
+***************************************************************************************************
+*/
+static inline UINT_32 Max(
+    UINT_32 value1,
+    UINT_32 value2)
+{
+    return (value2 >= value1) ? value2 : value1;
+}
+
+/**
+***************************************************************************************************
+*   Max
+*
+*   @brief
+*       Return the larger of two signed 32-bit values
+***************************************************************************************************
+*/
+static inline INT_32 Max(
+    INT_32 value1,
+    INT_32 value2)
+{
+    return (value2 >= value1) ? value2 : value1;
+}
+
+/**
+***************************************************************************************************
+*   NextPow2
+*
+*   @brief
+*       Return the smallest power of two that is >= dim (1 for dim == 0)
+***************************************************************************************************
+*/
+static inline UINT_32 NextPow2(
+    UINT_32 dim)        ///< [in] dimension of miplevel
+{
+    UINT_32 pow2 = 1;
+
+    if (dim <= 0x7fffffff)
+    {
+        // Keep doubling until the candidate reaches or passes dim.
+        while (pow2 < dim)
+        {
+            pow2 += pow2;
+        }
+    }
+    else
+    {
+        // Inputs above 0x7fffffff are asserted on and answered with 0x80000000.
+        ADDR_ASSERT_ALWAYS();
+        pow2 = 0x80000000;
+    }
+
+    return pow2;
+}
+
+/**
+***************************************************************************************************
+*   Log2
+*
+*   @brief
+*       Compute the base-2 logarithm of a power-of-two value
+***************************************************************************************************
+*/
+static inline UINT_32 Log2(
+    UINT_32 x)      ///< [in] value to take the base-2 logarithm of
+{
+    //
+    // Only exact powers of two are expected; debug builds check this.
+    //
+    ADDR_ASSERT(IsPow2(x));
+
+    UINT_32 exponent = 0;
+
+    // Count the right shifts needed to bring the value down to 1;
+    // an input of 0 or 1 therefore yields 0.
+    for (UINT_32 rest = x; rest > 1; rest >>= 1)
+    {
+        exponent++;
+    }
+
+    return exponent;
+}
+
+/**
+***************************************************************************************************
+*   QLog2
+*
+*   @brief
+*       Look up the base-2 logarithm of a small power of two (1, 2, 4, 8 or 16)
+***************************************************************************************************
+*/
+static inline UINT_32 QLog2(
+    UINT_32 x)      ///< [in] value to take the base-2 logarithm of
+{
+    ADDR_ASSERT(x <= 16);
+
+    // Each supported input maps straight to its exponent.
+    switch (x)
+    {
+        case 1:
+            return 0;
+
+        case 2:
+            return 1;
+
+        case 4:
+            return 2;
+
+        case 8:
+            return 3;
+
+        case 16:
+            return 4;
+
+        default:
+            break;
+    }
+
+    ADDR_ASSERT_ALWAYS();   // any other input is unsupported and yields 0
+    return 0;
+}
+
+/**
+***************************************************************************************************
+*   SafeAssign
+*
+*   @brief
+*       Store a 32-bit value through a pointer, doing nothing when the pointer is NULL
+***************************************************************************************************
+*/
+static inline VOID SafeAssign(
+    UINT_32*    pLVal,  ///< [out] Destination, may be NULL
+    UINT_32     rVal)   ///< [in] Value to store
+{
+    // A NULL destination means there is nowhere to store the value.
+    if (!pLVal) { return; }
+
+    *pLVal = rVal;
+}
+
+/**
+***************************************************************************************************
+*   SafeAssign
+*
+*   @brief
+*       Store a 64-bit value through a pointer, doing nothing when the pointer is NULL
+***************************************************************************************************
+*/
+static inline VOID SafeAssign(
+    UINT_64*    pLVal,  ///< [out] Destination, may be NULL
+    UINT_64     rVal)   ///< [in] Value to store
+{
+    // A NULL destination means there is nowhere to store the value.
+    if (!pLVal) { return; }
+
+    *pLVal = rVal;
+}
+
+/**
+***************************************************************************************************
+*   SafeAssign
+*
+*   @brief
+*       Store an AddrTileMode through a pointer, doing nothing when the pointer is NULL
+***************************************************************************************************
+*/
+static inline VOID SafeAssign(
+    AddrTileMode*    pLVal, ///< [out] Destination, may be NULL
+    AddrTileMode     rVal)  ///< [in] Value to store
+{
+    // A NULL destination means there is nowhere to store the value.
+    if (!pLVal) { return; }
+
+    *pLVal = rVal;
+}
+
+#endif // __ADDR_COMMON_H__
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.cpp
new file mode 100644
index 00000000000..76b1badf958
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.cpp
@@ -0,0 +1,1674 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrelemlib.cpp
+* @brief Contains the class implementation for element/pixel related functions
+***************************************************************************************************
+*/
+
+#include "addrelemlib.h"
+#include "addrlib.h"
+
+
+/**
+***************************************************************************************************
+*   AddrElemLib::AddrElemLib
+*
+*   @brief
+*       Constructor; selects the depth planar type and fp16 export mode from the chip family
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+AddrElemLib::AddrElemLib(
+    AddrLib* const pAddrLib) :  ///< [in] Parent addrlib instance pointer
+    AddrObject(pAddrLib->GetClient()),
+    m_pAddrLib(pAddrLib)
+{
+    // Start from the settings shared by R8xx, NI and every family that is
+    // not listed explicitly; only R6xx and R7xx override them below.
+    m_depthPlanarType = ADDR_DEPTH_PLANAR_R800;
+    m_fp16ExportNorm = 1;
+
+    switch (m_pAddrLib->GetAddrChipFamily())
+    {
+        case ADDR_CHIP_FAMILY_R6XX:
+            m_depthPlanarType = ADDR_DEPTH_PLANAR_R600;
+            m_fp16ExportNorm = 0;
+            break;
+
+        case ADDR_CHIP_FAMILY_R7XX:
+            m_depthPlanarType = ADDR_DEPTH_PLANAR_R600;
+            break;
+
+        default:    // R8XX, NI and all remaining families keep the defaults
+            break;
+    }
+
+    m_configFlags.value = 0;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::~AddrElemLib
+*
+*   @brief
+*       destructor
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+AddrElemLib::~AddrElemLib()
+{   // Empty body: no explicit cleanup is performed here.
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::Create
+*
+*   @brief
+*       Creates an AddrElemLib object bound to the given parent AddrLib.
+*
+*   @return
+*       Pointer to the new AddrElemLib, or NULL when pAddrLib is NULL.
+***************************************************************************************************
+*/
+AddrElemLib* AddrElemLib::Create(
+    const AddrLib* const        pAddrLib)   ///< [in] Pointer of parent AddrLib instance
+{
+    if (!pAddrLib)
+    {
+        return NULL;    // no parent library, so nothing can be created
+    }
+
+    AddrLib* const pParent = const_cast<AddrLib* const>(pAddrLib);
+
+    return new(pAddrLib->GetClient()) AddrElemLib(pParent);
+}
+
+/**************************************************************************************************
+*   AddrElemLib::Flt32sToInt32s
+*
+*   @brief
+*       Convert one ADDR_FLT_32 component to its integer encoding for the given number type
+*
+*   @return
+*       N/A; the result is stored in *pResult, which zero-bit and invalid types leave untouched
+***************************************************************************************************
+*/
+VOID AddrElemLib::Flt32sToInt32s(
+    ADDR_FLT_32     value,      ///< [in] ADDR_FLT_32 value
+    UINT_32         bits,       ///< [in] number of bits in value
+    AddrNumberType  numberType, ///< [in] the type of number
+    UINT_32*        pResult)    ///< [out] Int32 value
+{
+    UINT_8 round = 128;    //ADDR_ROUND_BY_HALF
+    UINT_32 uscale;
+    UINT_32 sign;
+
+    //convert each component to an INT_32
+    switch ( numberType )
+    {
+        case ADDR_NO_NUMBER:    //fall through
+        case ADDR_ZERO:         //fall through
+        case ADDR_ONE:          //fall through
+        case ADDR_EPSILON:      //fall through
+            return;        // these are zero-bit components, so don't set result
+
+        case ADDR_UINT_BITS:            // unsigned integer bit field, clamped to range
+            if (bits == 32)               // special case unsigned 32-bit int
+            {
+                *pResult = value.i;
+            }
+            else
+            {
+                uscale = (1<<bits) - 1;   // computed here only: 1<<32 would be undefined
+                if ((value.i < 0) || (value.u > uscale))
+                {
+                    *pResult = uscale;
+                }
+                else
+                {
+                    *pResult = value.i;
+                }
+            }
+            return;     // both paths are complete; never fall into the unorm cases below
+
+        // The algorithm used in the DB and TX differs at one value for 24-bit unorms
+        case ADDR_UNORM_R6XXDB:        // unsigned repeating fraction
+            if ((bits==24) && (value.i == 0x33000000))
+            {
+                *pResult = 1;
+                return;
+            }              // Else treat like ADDR_UNORM_R6XX
+
+        case ADDR_UNORM_R6XX:            // unsigned repeating fraction
+            if (value.f <= 0)
+            {
+                *pResult = 0;            // first clamp to [0..1]
+            }
+            else
+            {
+                if (value.f >= 1)
+                {
+                     *pResult = (1<<bits) - 1;
+                }
+                else
+                {
+                    if ((value.i | 0x87FFFFFF) == 0xFFFFFFFF)
+                    {
+                        *pResult = 0;                        // NaN, so force to 0
+                    }
+
+                    #if 0 // floating point version for documentation
+                    else
+                    {
+                        FLOAT f = value.f * ((1<<bits) - 1);
+                        *pResult = static_cast<INT_32>(f + (round/256.0f));
+                    }
+                    #endif
+                    else
+                    {
+                        ADDR_FLT_32 scaled;
+                        ADDR_FLT_32 shifted;
+                        UINT_64 truncated, rounded;
+                        UINT_32 altShift;
+                        UINT_32 mask = (1 << bits) - 1;
+                        UINT_32 half = 1 << (bits - 1);
+                        UINT_32 mant24 = (value.i & 0x7FFFFF) + 0x800000;
+                        UINT_64 temp = mant24 - (mant24>>bits) -
+                            static_cast<INT_32>((mant24 & mask) > half);
+                        UINT_32 exp8 = value.i >> 23;
+                        UINT_32 shift = 126 - exp8 + 24 - bits;
+                        UINT_64 final;
+
+                        if (shift >= 32) // This is zero, even with maximum dither add
+                        {
+                            final = 0;
+                        }
+                        else
+                        {
+                            final = ((temp<<8) + (static_cast<UINT_64>(round)<<shift)) >> (shift+8);
+                        }
+                        //ADDR_EXIT( *pResult == final,
+                        //    ("Float %x converted to %d-bit Unorm %x != bitwise %x",
+                        //     value.u, bits, (UINT_32)*pResult, (UINT_32)final) );
+                        if (final > mask)
+                        {
+                            final = mask;
+                        }
+
+                        scaled.f  = value.f * ((1<<bits) - 1);
+                        shifted.f = (scaled.f * 256);
+                        truncated = ((shifted.i&0x7FFFFF) + (INT_64)0x800000) << 8;
+                        altShift  = 126 + 24 + 8 - ((shifted.i>>23)&0xFF);
+                        truncated = (altShift > 60) ? 0 : truncated >> altShift;
+                        rounded   = static_cast<INT_32>((round + truncated) >> 8);
+                        //if (rounded > ((1<<bits) - 1))
+                        //    rounded = ((1<<bits) - 1);
+                        *pResult = static_cast<INT_32>(rounded); //(INT_32)final;
+                    }
+                }
+            }
+
+            return;
+
+        case ADDR_S8FLOAT32:    // 32-bit IEEE float, passes through NaN values
+            *pResult = value.i;
+            return;
+
+        // @@ FIX ROUNDING in this code, fix the denorm case
+        case ADDR_U4FLOATC:         // Unsigned float, 4-bit exponent. bias 15, clamped [0..1]
+            sign = (value.i >> 31) & 1;
+            if ((value.i&0x7F800000) == 0x7F800000)    // If NaN or INF:
+            {
+                if ((value.i&0x007FFFFF) != 0)             // then if NaN
+                {
+                    *pResult = 0;                       // return 0
+                }
+                else
+                {
+                    *pResult = (sign)?0:0xF00000;           // else +INF->+1, -INF->0
+                }
+                return;
+            }
+            if (value.f <= 0)
+            {
+                *pResult = 0;
+            }
+            else
+            {
+                if (value.f>=1)
+                {
+                    *pResult = 0xF << (bits-4);
+                }
+                else
+                {
+                    if ((value.i>>23) > 112 )
+                    {
+                        // 24-bit float: normalized
+                        // value.i += 1 << (22-bits+4);
+                        // round the IEEE mantissa to mantissa size
+                        // @@ NOTE: add code to support rounding
+                        value.u &= 0x7FFFFFF;             // mask off high 4 exponent bits
+                        *pResult = value.i >> (23-bits+4);// shift off unused mantissa bits
+                    }
+                    else
+                    {
+                        // 24-bit float: denormalized
+                        value.f = value.f / (1<<28) / (1<<28);
+                        value.f = value.f / (1<<28) / (1<<28);    // convert to IEEE denorm
+                        // value.i += 1 << (22-bits+4);
+                        // round the IEEE mantissa to mantissa size
+                        // @@ NOTE: add code to support rounding
+                        *pResult = value.i >> (23-bits+4);    // shift off unused mantissa bits
+                    }
+                }
+            }
+
+            return;
+
+        default:                    // invalid number mode
+            //ADDR_EXIT(0, ("Invalid AddrNumber %d", numberType) );
+            break;
+
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::Int32sToPixel
+*
+*   @brief
+*       Pack 32-bit integer values into an uncompressed pixel,
+*       in the proper order
+*
+*   @return
+*       N/A
+*
+*   @note
+*       This entry point packs up to numComps 32-bit integer values
+*       into an uncompressed pixel. The pixel values are specified in
+*       standard order, e.g. depth/stencil. It is not meant to be
+*       called on a compressed pixel (no assert is implemented yet).
+***************************************************************************************************
+*/
+VOID AddrElemLib::Int32sToPixel(
+    UINT_32              numComps,      ///< [in] number of components
+    UINT_32*             pComps,        ///< [in] components
+    UINT_32*             pCompBits,     ///< [in] total bits in each component
+    UINT_32*             pCompStart,    ///< [in] the first bit position of each component
+    ADDR_COMPONENT_FLAGS properties,    ///< [in] properties about byteAligned, exportNorm
+    UINT_32              resultBits,    ///< [in] result bits: total bpp after decompression
+    UINT_8*              pPixel)        ///< [out] a depth/stencil pixel value
+{
+    const UINT_32 elementXor = 0;  // address xor when reading bytes from elements
+
+    // @@ NOTE: assert if called on a compressed format!
+
+    if (!properties.byteAligned)
+    {
+        // Element is 32-bits or less, components are bit fields.
+        // First, merge every component into one 32-bit value, and record
+        // which bits of the element are owned by some component.
+        UINT_32 packed   = 0;
+        UINT_32 usedBits = 0;
+
+        for (UINT_32 comp = 0; comp < numComps; comp++)
+        {
+            const UINT_32 fieldMask = (1 << pCompBits[comp]) - 1;
+
+            usedBits |= fieldMask << pCompStart[comp];
+            packed   |= (pComps[comp] & fieldMask) << pCompStart[comp];
+        }
+
+        // Next, copy the masked value into the element one byte at a time,
+        // leaving bits that belong to no component as they were.
+        const UINT_32 numBytes = (resultBits + 7) / 8;
+
+        for (UINT_32 b = 0; b < numBytes; b++)
+        {
+            const UINT_32 kept = pPixel[b^elementXor] & ~(usedBits >> (8*b));
+            pPixel[b^elementXor] = static_cast<UINT_8>(kept | ((usedBits & packed) >> (8*b)));
+        }
+    }
+    else
+    {
+        // Components are all byte-sized: copy each one's bytes into the element.
+        for (UINT_32 comp = 0; comp < numComps; comp++)
+        {
+            const UINT_32 firstByte = pCompStart[comp] / 8;
+            const UINT_32 numBytes  = pCompBits[comp]  / 8;
+
+            for (UINT_32 b = 0; b < numBytes; b++)
+            {
+                pPixel[(b+firstByte)^elementXor] = static_cast<UINT_8>(pComps[comp] >> (8*b));
+            }
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   Flt32ToDepthPixel
+*
+*   @brief
+*       Convert two FLT_32 components to integers and pack them into a depth/stencil pixel
+*
+*   @return
+*       N/A; the packed bytes are written to pPixel
+***************************************************************************************************
+*/
+VOID AddrElemLib::Flt32ToDepthPixel(
+    AddrDepthFormat     format,     ///< [in] Depth format
+    const ADDR_FLT_32   comps[2],   ///< [in] two components of depth
+    UINT_8*             pPixel      ///< [out] depth pixel value
+    ) const
+{
+    UINT_32 i;
+    UINT_32 values[2] = {0, 0};         // Flt32sToInt32s skips zero-bit/invalid components
+    ADDR_COMPONENT_FLAGS properties;    // byteAligned, exportNorm
+    UINT_32 resultBits = 0;             // result bits: total bits per pixel after decompression
+
+    ADDR_PIXEL_FORMATINFO fmt;
+
+    // get type for each component
+    PixGetDepthCompInfo(format, &fmt);
+
+    //initialize properties
+    properties.byteAligned = TRUE;
+    properties.exportNorm  = TRUE;
+    properties.floatComp   = FALSE;
+
+    //set properties and result bits
+    for (i = 0; i < 2; i++)
+    {
+        if ((fmt.compBit[i] & 7) || (fmt.compStart[i] & 7))
+        {
+            properties.byteAligned = FALSE;
+        }
+
+        if (resultBits < fmt.compStart[i] + fmt.compBit[i])
+        {
+            resultBits = fmt.compStart[i] + fmt.compBit[i];
+        }
+
+        // Clear ADDR_EXPORT_NORM if can't be represented as 11-bit or smaller [-1..+1] format
+        if (fmt.compBit[i] > 11 || fmt.numType[i] >= ADDR_USCALED)
+        {
+            properties.exportNorm = FALSE;
+        }
+
+        // Mark if there are any floating point components
+        if ((fmt.numType[i] == ADDR_U4FLOATC) || (fmt.numType[i] >= ADDR_S8FLOAT) )
+        {
+            properties.floatComp = TRUE;
+        }
+    }
+
+    // Convert the two input floats to integer values
+    for (i = 0; i < 2; i++)
+    {
+        Flt32sToInt32s(comps[i], fmt.compBit[i], fmt.numType[i], &values[i]);
+    }
+
+    // Then pack the two integer components, in the proper order
+    Int32sToPixel(2, values, fmt.compBit, fmt.compStart, properties, resultBits, pPixel );
+
+}
+
+/**
+***************************************************************************************************
+*   Flt32ToColorPixel
+*
+*   @brief
+*       Convert four FLT_32 components to integers and pack them into a color pixel
+*
+*   @return
+*       N/A; the packed bytes are written to pPixel
+***************************************************************************************************
+*/
+VOID AddrElemLib::Flt32ToColorPixel(
+    AddrColorFormat     format,     ///< [in] Color format
+    AddrSurfaceNumber   surfNum,    ///< [in] Surface number
+    AddrSurfaceSwap     surfSwap,   ///< [in] Surface swap
+    const ADDR_FLT_32   comps[4],   ///< [in] four components of color
+    UINT_8*             pPixel      ///< [out] a red/green/blue/alpha pixel value
+    ) const
+{
+    ADDR_PIXEL_FORMATINFO pixelInfo;
+
+    UINT_32 i;
+    UINT_32 values[4] = {0, 0, 0, 0};   // Flt32sToInt32s skips zero-bit/invalid components
+    ADDR_COMPONENT_FLAGS properties;    // byteAligned, exportNorm
+    UINT_32 resultBits = 0;             // result bits: total bits per pixel after decompression
+
+    memset(&pixelInfo, 0, sizeof(ADDR_PIXEL_FORMATINFO));
+
+    PixGetColorCompInfo(format, surfNum, surfSwap, &pixelInfo);
+
+    //initialize properties
+    properties.byteAligned = TRUE;
+    properties.exportNorm  = TRUE;
+    properties.floatComp   = FALSE;
+
+    //set properties and result bits
+    for (i = 0; i < 4; i++)
+    {
+        if ( (pixelInfo.compBit[i] & 7) || (pixelInfo.compStart[i] & 7) )
+        {
+            properties.byteAligned = FALSE;
+        }
+
+        if (resultBits < pixelInfo.compStart[i] + pixelInfo.compBit[i])
+        {
+            resultBits = pixelInfo.compStart[i] + pixelInfo.compBit[i];
+        }
+
+        if (m_fp16ExportNorm)
+        {
+            // Clear ADDR_EXPORT_NORM if can't be represented as 11-bit or smaller [-1..+1] format
+            // or if it's not FP and <=16 bits
+            if (((pixelInfo.compBit[i] > 11) || (pixelInfo.numType[i] >= ADDR_USCALED))
+                && (pixelInfo.numType[i] !=ADDR_U4FLOATC))
+            {
+                properties.exportNorm = FALSE;
+            }
+        }
+        else
+        {
+            // Clear ADDR_EXPORT_NORM if can't be represented as 11-bit or smaller [-1..+1] format
+            if (pixelInfo.compBit[i] > 11 || pixelInfo.numType[i] >= ADDR_USCALED)
+            {
+                properties.exportNorm = FALSE;
+            }
+        }
+
+        // Mark if there are any floating point components
+        if ( (pixelInfo.numType[i] == ADDR_U4FLOATC) ||
+             (pixelInfo.numType[i] >= ADDR_S8FLOAT) )
+        {
+            properties.floatComp = TRUE;
+        }
+    }
+
+    // Convert the four input floats to integer values
+    for (i = 0; i < 4; i++)
+    {
+        Flt32sToInt32s(comps[i], pixelInfo.compBit[i], pixelInfo.numType[i], &values[i]);
+    }
+
+    // Then pack the four integer components, in the proper order
+    Int32sToPixel(4, values, &pixelInfo.compBit[0], &pixelInfo.compStart[0],
+                  properties, resultBits, pPixel);
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompType
+*
+*   @brief
+*       Fill per component info
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::GetCompType(
+    AddrColorFormat         format,     ///< [in] color surface format
+    AddrSurfaceNumber       numType,  ///< [in] requested number type (forced to FLOAT for float formats)
+    ADDR_PIXEL_FORMATINFO*  pInfo)       ///< [in/out] compBit[] is read, numType[] is filled in
+{
+    BOOL_32 handled = FALSE; // TRUE once a format-specific case below has filled in numType[]
+
+    // Floating point formats override the number format
+    switch (format)
+    {
+        case ADDR_COLOR_16_FLOAT:            // fall through for all pure floating point formats
+        case ADDR_COLOR_16_16_FLOAT:
+        case ADDR_COLOR_16_16_16_16_FLOAT:
+        case ADDR_COLOR_32_FLOAT:
+        case ADDR_COLOR_32_32_FLOAT:
+        case ADDR_COLOR_32_32_32_32_FLOAT:
+        case ADDR_COLOR_10_11_11_FLOAT:
+        case ADDR_COLOR_11_11_10_FLOAT:
+            numType = ADDR_NUMBER_FLOAT;
+            break;
+            // Special handling for the depth formats: the type follows each component's bit width
+        case ADDR_COLOR_8_24:                // fall through for these 2 similar formats
+        case ADDR_COLOR_24_8:
+            for (UINT_32 c = 0; c < 4; c++)
+            {
+                if (pInfo->compBit[c] == 8)
+                {
+                    pInfo->numType[c] = ADDR_UINT_BITS;
+                }
+                else if (pInfo->compBit[c]  == 24)
+                {
+                    pInfo->numType[c] = ADDR_UNORM_R6XX;
+                }
+                else
+                {
+                    pInfo->numType[c] = ADDR_NO_NUMBER;
+                }
+            }
+            handled = TRUE;
+            break;
+        case ADDR_COLOR_8_24_FLOAT:          // fall through for these 3 similar formats
+        case ADDR_COLOR_24_8_FLOAT:
+        case ADDR_COLOR_X24_8_32_FLOAT:
+            for (UINT_32 c = 0; c < 4; c++)
+            {
+                if (pInfo->compBit[c] == 8)
+                {
+                    pInfo->numType[c] = ADDR_UINT_BITS;
+                }
+                else if (pInfo->compBit[c] == 24)
+                {
+                    pInfo->numType[c] = ADDR_U4FLOATC;
+                }
+                else if (pInfo->compBit[c] == 32)
+                {
+                    pInfo->numType[c] = ADDR_S8FLOAT32;
+                }
+                else
+                {
+                    pInfo->numType[c] = ADDR_NO_NUMBER;
+                }
+            }
+            handled = TRUE;
+            break;
+        default:
+            break;
+    }
+
+    if (!handled) // generic path: derive each type from numType and the component's bit width
+    {
+        for (UINT_32 c = 0; c < 4; c++)
+        {
+            // Assign a number type for each component
+            AddrSurfaceNumber cnum;
+
+            // First handle default component values
+            if (pInfo->compBit[c] == 0)
+            {
+                if (c < 3)
+                {
+                    pInfo->numType[c] = ADDR_ZERO;      // Default is zero for RGB
+                }
+                else if (numType == ADDR_NUMBER_UINT || numType == ADDR_NUMBER_SINT)
+                {
+                    pInfo->numType[c] = ADDR_EPSILON;   // Alpha INT_32 bits default is 0x01
+                }
+                else
+                {
+                    pInfo->numType[c] = ADDR_ONE;       // Alpha normal default is float 1.0
+                }
+                continue; // a zero-width component needs no further processing
+            }
+            // Now handle small (1-bit) components: only UINT or UNORM is used for them
+            else if (pInfo->compBit[c] == 1)
+            {
+                if (numType == ADDR_NUMBER_UINT || numType == ADDR_NUMBER_SINT)
+                {
+                    cnum = ADDR_NUMBER_UINT;
+                }
+                else
+                {
+                    cnum = ADDR_NUMBER_UNORM;
+                }
+            }
+            else
+            {
+                cnum = numType;
+            }
+
+            // If no default, set the number type from num, compbits, and architecture
+            switch (cnum)
+            {
+                case ADDR_NUMBER_SRGB: // gamma type for components 0-2, plain UNORM for component 3
+                    pInfo->numType[c] = (c < 3) ? ADDR_GAMMA8_R6XX : ADDR_UNORM_R6XX;
+                    break;
+                case ADDR_NUMBER_UNORM:
+                    pInfo->numType[c] = ADDR_UNORM_R6XX;
+                    break;
+                case ADDR_NUMBER_SNORM:
+                    pInfo->numType[c] = ADDR_SNORM_R6XX;
+                    break;
+                case ADDR_NUMBER_USCALED:
+                    pInfo->numType[c] = ADDR_USCALED;  // @@ Do we need separate Pele routine?
+                    break;
+                case ADDR_NUMBER_SSCALED:
+                    pInfo->numType[c] = ADDR_SSCALED;  // @@ Do we need separate Pele routine?
+                    break;
+                case ADDR_NUMBER_FLOAT: // the float flavour depends on the component width
+                    if (pInfo->compBit[c] == 32)
+                    {
+                        pInfo->numType[c] = ADDR_S8FLOAT32;
+                    }
+                    else if (pInfo->compBit[c] == 16)
+                    {
+                        pInfo->numType[c] = ADDR_S5FLOAT;
+                    }
+                    else if (pInfo->compBit[c] >= 10)
+                    {
+                        pInfo->numType[c] = ADDR_U5FLOAT;
+                    }
+                    else
+                    {
+                        ADDR_ASSERT_ALWAYS(); // unexpected width; numType[c] is left unchanged
+                    }
+                    break;
+                case ADDR_NUMBER_SINT:
+                    pInfo->numType[c] = ADDR_SINT_BITS;
+                    break;
+                case ADDR_NUMBER_UINT:
+                    pInfo->numType[c] = ADDR_UINT_BITS;
+                    break;
+
+                default:
+                    ADDR_ASSERT(!"Invalid number type");
+                    pInfo->numType[c] = ADDR_NO_NUMBER;
+                    break;
+             }
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompSwap
+*
+*   @brief
+*       Reorder the per component start/bit info of a color surface according to the swap mode
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::GetCompSwap(
+    AddrSurfaceSwap         swap,   ///< [in] swap mode
+    ADDR_PIXEL_FORMATINFO*  pInfo)  ///< [in/out] per component info; comps picks the swap table
+{
+    switch (pInfo->comps) // one swap table per component count; other counts are left untouched
+    {
+        case 4:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT:
+                    SwapComps( 0, 2, pInfo );
+                    break;    // BGRA
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 3, pInfo );
+                    SwapComps( 1, 2, pInfo );
+                    break;    // ABGR
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 3, pInfo );
+                    SwapComps( 0, 2, pInfo );
+                    SwapComps( 0, 1, pInfo );
+                    break;    // ARGB
+                default:
+                    break;    // no swap needed for any other mode
+            }
+            break;
+        case 3:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 3, pInfo );
+                    SwapComps( 0, 2, pInfo );
+                    break;    // AGR
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 2, pInfo );
+                    break;    // BGR
+                case ADDR_SWAP_ALT:
+                    SwapComps( 2, 3, pInfo );
+                    break;    // RGA
+                default:
+                    break;    // RGB
+            }
+            break;
+        case 2:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 1, pInfo );
+                    SwapComps( 1, 3, pInfo );
+                    break;    // AR
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 1, pInfo );
+                    break;    // GR
+                case ADDR_SWAP_ALT:
+                    SwapComps( 1, 3, pInfo );
+                    break;    // RA
+                default:
+                    break;    // RG
+            }
+            break;
+        case 1:
+            switch (swap)
+            {
+                case ADDR_SWAP_ALT_REV:
+                    SwapComps( 0, 3, pInfo );
+                    break;    // A
+                case ADDR_SWAP_STD_REV:
+                    SwapComps( 0, 2, pInfo );
+                    break;    // B
+                case ADDR_SWAP_ALT:
+                    SwapComps( 0, 1, pInfo );
+                    break;    // G
+                default:
+                    break;    // R
+            }
+            break;
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::SwapComps
+*
+*   @brief
+*       Exchange the start bit and the bit width of two components in the format info
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::SwapComps(
+    UINT_32                 c0,     ///< [in] index of the first component to exchange
+    UINT_32                 c1,     ///< [in] index of the second component to exchange
+    ADDR_PIXEL_FORMATINFO*  pInfo)  ///< [in/out] per component info, updated in place
+{
+    // Exchange the start bit positions
+    const UINT_32 firstStart = pInfo->compStart[c0];
+
+    pInfo->compStart[c0] = pInfo->compStart[c1];
+    pInfo->compStart[c1] = firstStart;
+
+    // Exchange the bit widths; numType[] is not touched here
+    const UINT_32 firstBits = pInfo->compBit[c0];
+    pInfo->compBit[c0] = pInfo->compBit[c1];
+    pInfo->compBit[c1] = firstBits;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::PixGetColorCompInfo
+*
+*   @brief
+*       Get per component bit layout, number type and swap order for a color surface
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::PixGetColorCompInfo(
+    AddrColorFormat         format, ///< [in] surface format, read from register
+    AddrSurfaceNumber       number, ///< [in] pixel number type
+    AddrSurfaceSwap         swap,   ///< [in] component swap mode
+    ADDR_PIXEL_FORMATINFO*  pInfo   ///< [out] output per component info
+    ) const
+{
+    // 1. Get component bits
+    switch (format)
+    {
+        case ADDR_COLOR_8:
+            GetCompBits(8, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_1_5_5_5:
+            GetCompBits(5, 5, 5, 1, pInfo);
+            break;
+        case ADDR_COLOR_5_6_5:
+            GetCompBits(5, 6, 5, 0, pInfo); // widths must add up to the 16-bit element
+            break;
+        case ADDR_COLOR_6_5_5:
+            GetCompBits(5, 5, 6, 0, pInfo);
+            break;
+        case ADDR_COLOR_8_8:
+            GetCompBits(8, 8, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_4_4_4_4:
+            GetCompBits(4, 4, 4, 4, pInfo);
+            break;
+        case ADDR_COLOR_16:
+            GetCompBits(16, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_8_8_8_8:
+            GetCompBits(8, 8, 8, 8, pInfo);
+            break;
+        case ADDR_COLOR_2_10_10_10:
+            GetCompBits(10, 10, 10, 2, pInfo);
+            break;
+        case ADDR_COLOR_10_11_11:
+            GetCompBits(11, 11, 10, 0, pInfo);
+            break;
+        case ADDR_COLOR_11_11_10:
+            GetCompBits(10, 11, 11, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16:
+            GetCompBits(16, 16, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16_16_16:
+            GetCompBits(16, 16, 16, 16, pInfo);
+            break;
+        case ADDR_COLOR_16_FLOAT:
+            GetCompBits(16, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16_FLOAT:
+            GetCompBits(16, 16, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_FLOAT:
+            GetCompBits(32, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_32_FLOAT:
+            GetCompBits(32, 32, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_16_16_16_16_FLOAT:
+            GetCompBits(16, 16, 16, 16, pInfo);
+            break;
+        case ADDR_COLOR_32_32_32_32_FLOAT:
+            GetCompBits(32, 32, 32, 32, pInfo);
+            break;
+
+        case ADDR_COLOR_32:
+            GetCompBits(32, 0, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_32:
+            GetCompBits(32, 32, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_32_32_32_32:
+            GetCompBits(32, 32, 32, 32, pInfo);
+            break;
+        case ADDR_COLOR_10_10_10_2:
+            GetCompBits(2, 10, 10, 10, pInfo);
+            break;
+        case ADDR_COLOR_10_11_11_FLOAT:
+            GetCompBits(11, 11, 10, 0, pInfo);
+            break;
+        case ADDR_COLOR_11_11_10_FLOAT:
+            GetCompBits(10, 11, 11, 0, pInfo);
+            break;
+        case ADDR_COLOR_5_5_5_1:
+            GetCompBits(1, 5, 5, 5, pInfo);
+            break;
+        case ADDR_COLOR_3_3_2:
+            GetCompBits(2, 3, 3, 0, pInfo);
+            break;
+        case ADDR_COLOR_4_4:
+            GetCompBits(4, 4, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_8_24:
+        case ADDR_COLOR_8_24_FLOAT:  // same bit count, fall through
+            GetCompBits(24, 8, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_24_8:
+        case ADDR_COLOR_24_8_FLOAT:  // same bit count, fall through
+            GetCompBits(8, 24, 0, 0, pInfo);
+            break;
+        case ADDR_COLOR_X24_8_32_FLOAT:
+            GetCompBits(32, 8, 0, 0, pInfo);
+            break;
+
+        case ADDR_COLOR_INVALID:
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+        default: // unknown format: assert, then report all components as empty
+            ADDR_ASSERT(0);
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+    }
+
+    // 2. Get component number type
+
+    GetCompType(format, number, pInfo);
+
+    // 3. Swap components if needed
+
+    GetCompSwap(swap, pInfo);
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::PixGetDepthCompInfo
+*
+*   @brief
+*       Get per component bit widths and number types for a depth surface
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrElemLib::PixGetDepthCompInfo(
+    AddrDepthFormat         format,     ///< [in] surface format, read from register
+    ADDR_PIXEL_FORMATINFO*  pInfo       ///< [out] output per component bits and type
+    ) const
+{
+    if (m_depthPlanarType == ADDR_DEPTH_PLANAR_R800) // remap the two *_24_FLOAT depth formats
+    {
+        if (format == ADDR_DEPTH_8_24_FLOAT)
+        {
+            format = ADDR_DEPTH_X24_8_32_FLOAT; // Use this format to represent R800's D24FS8
+        }
+
+        if (format == ADDR_DEPTH_X8_24_FLOAT)
+        {
+            format = ADDR_DEPTH_32_FLOAT; // handled below as a single 32-bit float component
+        }
+    }
+
+    switch (format) // first pass: bit widths of components 0 and 1
+    {
+        case ADDR_DEPTH_16:
+            GetCompBits(16, 0, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_8_24:
+        case ADDR_DEPTH_8_24_FLOAT:      // similar format, fall through
+            GetCompBits(24, 8, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_X8_24:
+        case ADDR_DEPTH_X8_24_FLOAT:     // similar format, fall through
+            GetCompBits(24, 0, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_32_FLOAT:
+            GetCompBits(32, 0, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_X24_8_32_FLOAT:
+            GetCompBits(32, 8, 0, 0, pInfo);
+            break;
+        case ADDR_DEPTH_INVALID:
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+        default: // unknown format: assert, then report all components as empty
+            ADDR_ASSERT(0);
+            GetCompBits(0, 0, 0, 0, pInfo);
+            break;
+    }
+
+    switch (format) // second pass: number types of components 0 and 1
+    {
+        case ADDR_DEPTH_16:
+            pInfo->numType [0] = ADDR_UNORM_R6XX;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_8_24:
+            pInfo->numType [0] = ADDR_UNORM_R6XXDB;
+            pInfo->numType [1] = ADDR_UINT_BITS;
+            break;
+        case ADDR_DEPTH_8_24_FLOAT:
+            pInfo->numType [0] = ADDR_U4FLOATC;
+            pInfo->numType [1] = ADDR_UINT_BITS;
+            break;
+        case ADDR_DEPTH_X8_24:
+            pInfo->numType [0] = ADDR_UNORM_R6XXDB;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_X8_24_FLOAT:
+            pInfo->numType [0] = ADDR_U4FLOATC;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_32_FLOAT:
+            pInfo->numType [0] = ADDR_S8FLOAT32;
+            pInfo->numType [1] = ADDR_ZERO;
+            break;
+        case ADDR_DEPTH_X24_8_32_FLOAT:
+            pInfo->numType [0] = ADDR_S8FLOAT32;
+            pInfo->numType [1] = ADDR_UINT_BITS;
+            break;
+        default: // ADDR_DEPTH_INVALID and unknown formats carry no number type
+            pInfo->numType [0] = ADDR_NO_NUMBER;
+            pInfo->numType [1] = ADDR_NO_NUMBER;
+            break;
+    }
+
+    pInfo->numType [2] = ADDR_NO_NUMBER; // every case above gives components 2 and 3 zero bits
+    pInfo->numType [3] = ADDR_NO_NUMBER;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::PixGetExportNorm
+*
+*   @brief
+*       Decide whether export norm can be enabled for a color format, honoring fp16 export norm
+*
+*   @return
+*       TRUE if every component allows it, FALSE as soon as one component does not
+*
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::PixGetExportNorm(
+    AddrColorFormat     colorFmt,       ///< [in] surface format, read from register
+    AddrSurfaceNumber   numberFmt,      ///< [in] pixel number type
+    AddrSurfaceSwap     swap            ///< [in] components swap type
+    ) const
+{
+    ADDR_PIXEL_FORMATINFO info;
+
+    // Resolve the per component bit widths and number types first
+    PixGetColorCompInfo(colorFmt, numberFmt, swap, &info);
+
+    BOOL_32 canExport = TRUE;
+
+    // Stop at the first component that rules export norm out
+    for (UINT_32 comp = 0; (comp < 4) && canExport; comp++)
+    {
+        // Basic rule: at most 11 bits and a number type no greater than ADDR_USCALED
+        const BOOL_32 overLimit = (info.compBit[comp] > 11) ||
+                                  (info.numType[comp] > ADDR_USCALED);
+
+        if (overLimit)
+        {
+            // With fp16 export norm these float types are still acceptable
+            const BOOL_32 fp16Exempt = m_fp16ExportNorm &&
+                ((info.numType[comp] == ADDR_U4FLOATC) ||
+                 (info.numType[comp] == ADDR_S5FLOAT)  ||
+                 (info.numType[comp] == ADDR_S5FLOATM) ||
+                 (info.numType[comp] == ADDR_U5FLOAT)  ||
+                 (info.numType[comp] == ADDR_U3FLOATM));
+
+            if (!fp16Exempt)
+            {
+                canExport = FALSE;
+            }
+        }
+    }
+
+    return canExport;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::AdjustSurfaceInfo
+*
+*   @brief
+*       Adjust bpp/base pitch/width/height in place according to elemMode and expandX/Y
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::AdjustSurfaceInfo(
+    AddrElemMode    elemMode,       ///< [in] element mode
+    UINT_32         expandX,        ///< [in] decompression expansion factor in X
+    UINT_32         expandY,        ///< [in] decompression expansion factor in Y
+    UINT_32*        pBpp,           ///< [in/out] bpp
+    UINT_32*        pBasePitch,     ///< [in/out] base pitch
+    UINT_32*        pWidth,         ///< [in/out] width
+    UINT_32*        pHeight)        ///< [in/out] height
+{
+    UINT_32 packedBits;
+    UINT_32 basePitch;
+    UINT_32 width;
+    UINT_32 height;
+    UINT_32 bpp;
+    BOOL_32 bBCnFormat = FALSE; // set only by the BCn cases below, so it stays FALSE if pBpp is NULL
+
+    ADDR_ASSERT(pBpp != NULL); // asserted, but a NULL pointer is still tolerated by the checks below
+    ADDR_ASSERT(pWidth != NULL && pHeight != NULL && pBasePitch != NULL);
+
+    if (pBpp)
+    {
+        bpp = *pBpp;
+
+        switch (elemMode)
+        {
+            case ADDR_EXPANDED:
+                packedBits = bpp / expandX / expandY;
+                break;
+            case ADDR_PACKED_STD: // Different bit order
+            case ADDR_PACKED_REV:
+                packedBits = bpp * expandX * expandY;
+                break;
+            case ADDR_PACKED_GBGR:
+            case ADDR_PACKED_BGRG:
+                packedBits = bpp; // 32-bit packed ==> 2 32-bit result
+                break;
+            case ADDR_PACKED_BC1: // Fall through
+            case ADDR_PACKED_BC4:
+                packedBits = 64;
+                bBCnFormat = TRUE;
+                break;
+            case ADDR_PACKED_BC2: // Fall through
+            case ADDR_PACKED_BC3: // Fall through
+            case ADDR_PACKED_BC5: // Last of the 128-bit BCn modes
+                bBCnFormat = TRUE;
+                packedBits = 128;
+                break;
+            case ADDR_ROUND_BY_HALF:  // Fall through
+            case ADDR_ROUND_TRUNCATE: // Fall through
+            case ADDR_ROUND_DITHER:   // Fall through
+            case ADDR_UNCOMPRESSED:
+                packedBits = bpp;
+                break;
+            default: // unknown element mode: keep bpp unchanged and assert
+                packedBits = bpp;
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+
+        *pBpp = packedBits;
+    }
+
+    if (pWidth && pHeight && pBasePitch)
+    {
+        basePitch = *pBasePitch;
+        width     = *pWidth;
+        height    = *pHeight;
+
+        if ((expandX > 1) || (expandY > 1)) // dimensions only change when there is an expansion
+        {
+            if (elemMode == ADDR_EXPANDED)
+            {
+                basePitch *= expandX;
+                width     *= expandX;
+                height    *= expandY;
+            }
+            else
+            {
+                // Evergreen family workaround
+                if (bBCnFormat && (m_pAddrLib->GetAddrChipFamily() == ADDR_CHIP_FAMILY_R8XX))
+                {
+                    // For BCn we now pad it to POW2 at the beginning so it is safe to
+                    // divide by 4 directly
+                    basePitch = basePitch / expandX;
+                    width     = width  / expandX;
+                    height    = height / expandY;
+#if DEBUG
+                    width     = (width == 0) ? 1 : width;
+                    height    = (height == 0) ? 1 : height;
+
+                    if ((*pWidth > PowTwoAlign(width, 8) * expandX) ||
+                        (*pHeight > PowTwoAlign(height, 8) * expandY)) // 8 is 1D tiling alignment
+                    {
+                        // if this assertion is hit we may have issues if app samples
+                        // rightmost/bottommost pixels
+                        ADDR_ASSERT_ALWAYS();
+                    }
+#endif
+                }
+                else // Not BCn format we still keep old way (FMT_1? No real test yet)
+                {
+                    basePitch = (basePitch + expandX - 1) / expandX; // divide rounding up
+                    width     = (width + expandX - 1) / expandX;
+                    height    = (height + expandY - 1) / expandY;
+                }
+            }
+
+            *pBasePitch = basePitch; // 0 is legal value for base pitch.
+            *pWidth     = (width == 0) ? 1 : width;
+            *pHeight    = (height == 0) ? 1 : height;
+        } // if ((expandX > 1) || (expandY > 1))
+    } // if (pWidth && pHeight && pBasePitch)
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::RestoreSurfaceInfo
+*
+*   @brief
+*       Reverse operation of AdjustSurfaceInfo for bpp, width and height (no base pitch here)
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::RestoreSurfaceInfo(
+    AddrElemMode    elemMode,       ///< [in] element mode
+    UINT_32         expandX,        ///< [in] decompression expansion factor in X
+    UINT_32         expandY,        ///< [in] decompression expansion factor in Y
+    UINT_32*        pBpp,           ///< [in/out] bpp
+    UINT_32*        pWidth,         ///< [in/out] width
+    UINT_32*        pHeight)        ///< [in/out] height
+{
+    UINT_32 originalBits;
+    UINT_32 width;
+    UINT_32 height;
+    UINT_32 bpp;
+
+    ADDR_ASSERT(pBpp != NULL); // asserted, but a NULL pointer is still tolerated by the checks below
+    ADDR_ASSERT(pWidth != NULL && pHeight != NULL);
+
+    if (pBpp)
+    {
+        bpp = *pBpp;
+
+        switch (elemMode)
+        {
+        case ADDR_EXPANDED:
+            originalBits = bpp * expandX * expandY;
+            break;
+        case ADDR_PACKED_STD: // Different bit order
+        case ADDR_PACKED_REV:
+            originalBits = bpp / expandX / expandY;
+            break;
+        case ADDR_PACKED_GBGR:
+        case ADDR_PACKED_BGRG:
+            originalBits = bpp; // 32-bit packed ==> 2 32-bit result
+            break;
+        case ADDR_PACKED_BC1: // Fall through
+        case ADDR_PACKED_BC4:
+            originalBits = 64;
+            break;
+        case ADDR_PACKED_BC2: // Fall through
+        case ADDR_PACKED_BC3: // Fall through
+            case ADDR_PACKED_BC5:
+            originalBits = 128;
+            break;
+        case ADDR_ROUND_BY_HALF:  // Fall through
+        case ADDR_ROUND_TRUNCATE: // Fall through
+        case ADDR_ROUND_DITHER:   // Fall through
+        case ADDR_UNCOMPRESSED:
+            originalBits = bpp;
+            break;
+        default: // unknown element mode: keep bpp unchanged and assert
+            originalBits = bpp;
+            ADDR_ASSERT_ALWAYS();
+            break;
+        }
+
+        *pBpp = originalBits;
+    }
+
+    if (pWidth && pHeight)
+    {
+        width    = *pWidth;
+        height   = *pHeight;
+
+        if ((expandX > 1) || (expandY > 1)) // dimensions only change when there is an expansion
+        {
+            if (elemMode == ADDR_EXPANDED)
+            {
+                width /= expandX;
+                height /= expandY;
+            }
+            else
+            {
+                width *= expandX;
+                height *= expandY;
+            }
+        }
+
+        *pWidth  = (width == 0) ? 1 : width; // width and height are clamped to at least 1
+        *pHeight = (height == 0) ? 1 : height;
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetBitsPerPixel
+*
+*   @brief
+*       Compute the total bits per element according to a format
+*       code. For compressed formats, this is not the same as
+*       the number of bits per decompressed element.
+*
+*   @return
+*       Bits per pixel, or 0 for ADDR_FMT_INVALID and for unknown format codes
+***************************************************************************************************
+*/
+UINT_32 AddrElemLib::GetBitsPerPixel(
+    AddrFormat          format,         ///< [in] surface format code
+    AddrElemMode*       pElemMode,      ///< [out] element mode
+    UINT_32*            pExpandX,       ///< [out] decompression expansion factor in X
+    UINT_32*            pExpandY,       ///< [out] decompression expansion factor in Y
+    UINT_32*            pUnusedBits)    ///< [out] bits unused
+{
+    UINT_32 bpp;
+    UINT_32 expandX = 1;   // 1 means no expansion in X
+    UINT_32 expandY = 1;   // 1 means no expansion in Y
+    UINT_32 bitUnused = 0;
+    AddrElemMode elemMode = ADDR_UNCOMPRESSED; // default value
+
+    switch (format)
+    {
+        case ADDR_FMT_8:
+            bpp = 8;
+            break;
+        case ADDR_FMT_1_5_5_5:
+        case ADDR_FMT_5_6_5:
+        case ADDR_FMT_6_5_5:
+        case ADDR_FMT_8_8:
+        case ADDR_FMT_4_4_4_4:
+        case ADDR_FMT_16:
+        case ADDR_FMT_16_FLOAT:
+            bpp = 16;
+            break;
+        case ADDR_FMT_GB_GR: // treat as FMT_8_8
+            elemMode = ADDR_PACKED_GBGR;
+            bpp = 16;
+            break;
+        case ADDR_FMT_BG_RG: // treat as FMT_8_8
+            elemMode = ADDR_PACKED_BGRG;
+            bpp = 16;
+            break;
+        case ADDR_FMT_8_8_8_8:
+        case ADDR_FMT_2_10_10_10:
+        case ADDR_FMT_10_11_11:
+        case ADDR_FMT_11_11_10:
+        case ADDR_FMT_16_16:
+        case ADDR_FMT_16_16_FLOAT:
+        case ADDR_FMT_32:
+        case ADDR_FMT_32_FLOAT:
+        case ADDR_FMT_24_8:
+        case ADDR_FMT_24_8_FLOAT:
+            bpp = 32;
+            break;
+        case ADDR_FMT_16_16_16_16:
+        case ADDR_FMT_16_16_16_16_FLOAT:
+        case ADDR_FMT_32_32:
+        case ADDR_FMT_32_32_FLOAT:
+        case ADDR_FMT_CTX1:
+            bpp = 64;
+            break;
+        case ADDR_FMT_32_32_32_32:
+        case ADDR_FMT_32_32_32_32_FLOAT:
+            bpp = 128;
+            break;
+        case ADDR_FMT_INVALID:
+            bpp = 0;
+            break;
+        case ADDR_FMT_1_REVERSED: // 1 bit per pixel with an X expansion factor of 8
+            elemMode = ADDR_PACKED_REV;
+            expandX = 8;
+            bpp = 1;
+            break;
+        case ADDR_FMT_1: // 1 bit per pixel with an X expansion factor of 8
+            elemMode = ADDR_PACKED_STD;
+            expandX = 8;
+            bpp = 1;
+            break;
+        case ADDR_FMT_4_4:
+        case ADDR_FMT_3_3_2:
+            bpp = 8;
+            break;
+        case ADDR_FMT_5_5_5_1:
+            bpp = 16;
+            break;
+        case ADDR_FMT_32_AS_8:
+        case ADDR_FMT_32_AS_8_8:
+        case ADDR_FMT_8_24:
+        case ADDR_FMT_8_24_FLOAT:
+        case ADDR_FMT_10_10_10_2:
+        case ADDR_FMT_10_11_11_FLOAT:
+        case ADDR_FMT_11_11_10_FLOAT:
+        case ADDR_FMT_5_9_9_9_SHAREDEXP:
+            bpp = 32;
+            break;
+        case ADDR_FMT_X24_8_32_FLOAT: // the only format here that reports unused bits
+            bpp = 64;
+            bitUnused = 24;
+            break;
+        case ADDR_FMT_8_8_8:
+            elemMode = ADDR_EXPANDED;
+            bpp = 24;//@@ 8;      // read 3 elements per pixel
+            expandX = 3;
+            break;
+        case ADDR_FMT_16_16_16:
+        case ADDR_FMT_16_16_16_FLOAT:
+            elemMode = ADDR_EXPANDED;
+            bpp = 48;//@@ 16;      // read 3 elements per pixel
+            expandX = 3;
+            break;
+        case ADDR_FMT_32_32_32_FLOAT:
+        case ADDR_FMT_32_32_32:
+            elemMode = ADDR_EXPANDED;
+            expandX = 3;
+            bpp = 96;//@@ 32;      // read 3 elements per pixel
+            break;
+        case ADDR_FMT_BC1: // every BCn format below uses a 4x4 expansion
+            elemMode = ADDR_PACKED_BC1;
+            expandX = 4;
+            expandY = 4;
+            bpp = 64;
+            break;
+        case ADDR_FMT_BC4:
+            elemMode = ADDR_PACKED_BC4;
+            expandX = 4;
+            expandY = 4;
+            bpp = 64;
+            break;
+        case ADDR_FMT_BC2:
+            elemMode = ADDR_PACKED_BC2;
+            expandX = 4;
+            expandY = 4;
+            bpp = 128;
+            break;
+        case ADDR_FMT_BC3:
+            elemMode = ADDR_PACKED_BC3;
+            expandX = 4;
+            expandY = 4;
+            bpp = 128;
+            break;
+        case ADDR_FMT_BC5:
+        case ADDR_FMT_BC6: // reuse ADDR_PACKED_BC5
+        case ADDR_FMT_BC7: // reuse ADDR_PACKED_BC5
+            elemMode = ADDR_PACKED_BC5;
+            expandX = 4;
+            expandY = 4;
+            bpp = 128;
+            break;
+        default: // unknown format code: report 0 bpp and assert
+            bpp = 0;
+            ADDR_ASSERT_ALWAYS();
+            break;
+            // @@ or should this be an error?
+    }
+
+    SafeAssign(pExpandX, expandX);
+    SafeAssign(pExpandY, expandY);
+    SafeAssign(pUnusedBits, bitUnused);
+    SafeAssign(reinterpret_cast<UINT_32*>(pElemMode), elemMode);
+
+    return bpp;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::GetCompBits
+*
+*   @brief
+*       Fill in each component's bit size and start bit, the component count and element mode
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::GetCompBits(
+    UINT_32 c0,                     ///< [in] bits of component 0
+    UINT_32 c1,                     ///< [in] bits of component 1
+    UINT_32 c2,                     ///< [in] bits of component 2
+    UINT_32 c3,                     ///< [in] bits of component 3
+    ADDR_PIXEL_FORMATINFO* pInfo,   ///< [out] per component info out
+    AddrElemMode elemMode)          ///< [in] element mode
+{
+    const UINT_32 widths[4] = { c0, c1, c2, c3 };
+    UINT_32       nextStart = 0;    // running sum of the widths seen so far
+
+    pInfo->elemMode = elemMode;
+    pInfo->comps    = 0;
+
+    // The component count is still needed since the swap may depend on it
+    for (UINT_32 idx = 0; idx < 4; idx++)
+    {
+        pInfo->compBit[idx] = widths[idx];
+
+        if (pInfo->compBit[idx] != 0)
+        {
+            // A real component starts right after all the preceding ones
+            pInfo->compStart[idx] = nextStart;
+            pInfo->comps++;
+        }
+        else
+        {
+            // Null components start at bit 0 and have no number type
+            pInfo->compStart[idx] = 0;
+            pInfo->numType[idx]   = ADDR_NO_NUMBER;
+        }
+
+        nextStart += widths[idx];
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::SetClearComps
+*
+*   @brief
+*       Set the clear color (or clear depth/stencil) for a surface
+*
+*   @note
+*       If clearColor is zero, the default clear value (0, 0, 0, 1.0) replaces comps[4].
+*       If float32 is set, values are kept as is; else NaNs are unified and low 12 bits cleared
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrElemLib::SetClearComps(
+    ADDR_FLT_32 comps[4],   ///< [in/out] components
+    BOOL_32 clearColor,     ///< [in] TRUE if clear color is set (CLEAR_COLOR)
+    BOOL_32 float32)        ///< [in] TRUE if float32 component (BLEND_FLOAT32)
+{
+    // Clear color disabled: every component gets the default clear value
+    if (clearColor == FALSE)
+    {
+        comps[0].f = 0.0;
+        comps[1].f = 0.0;
+        comps[2].f = 0.0;
+        comps[3].f = 1.0;
+
+        return;
+    }
+
+    // Full precision: the supplied clear value is used unchanged
+    if (float32)
+    {
+        return;
+    }
+
+    // Otherwise rework the raw bits of each component
+    for (UINT_32 comp = 0; comp < 4; comp++)
+    {
+        // A magnitude above the 0x7F800000 pattern is treated as NaN
+        const BOOL_32 isNaN = ((comps[comp].u & 0x7FFFFFFF) > 0x7F800000);
+
+        if (isNaN)
+        {
+            // A NaN is replaced by the standard NaN value
+            comps[comp].u = 0xFFC00000;
+        }
+        else
+        {
+            // Anything else has its mantissa precision reduced
+            comps[comp].u = comps[comp].u & 0xFFFFF000;
+        }
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::IsBlockCompressed
+*
+*   @brief
+*       Tell whether the format code lies in the ADDR_FMT_BC1..ADDR_FMT_BC7 range
+*
+*   @note
+*       Both ends of the range are inclusive
+*   @return
+*       TRUE for a format code inside that range, FALSE otherwise
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::IsBlockCompressed(
+    AddrFormat format)  ///< [in] Format
+{
+    return (ADDR_FMT_BC1 <= format) && (format <= ADDR_FMT_BC7);
+}
+
+
+/**
+***************************************************************************************************
+*   AddrElemLib::IsCompressed
+*
+*   @brief
+*       TRUE if this is block compressed format or 1 bit format
+*
+*   @note
+*       NOTE(review): BC1/BC7 tests repeat IsBlockCompressed(); 1 bit formats are not tested
+*   @return
+*       BOOL_32
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::IsCompressed(
+    AddrFormat format)  ///< [in] Format code to classify
+{
+    return IsBlockCompressed(format) || format == ADDR_FMT_BC1 || format == ADDR_FMT_BC7;
+}
+
+/**
+***************************************************************************************************
+*   AddrElemLib::IsExpand3x
+*
+*   @brief
+*       Tell whether the format is one of the three component (3x expand) formats
+*
+*   @note
+*       The same formats are given expandX = 3 by GetBitsPerPixel
+*   @return
+*       TRUE for a 3x expand format, FALSE otherwise
+***************************************************************************************************
+*/
+BOOL_32 AddrElemLib::IsExpand3x(
+    AddrFormat format)  ///< [in] Format
+{
+    // Only the three component formats listed here count as 3x expand;
+    // every other format code answers FALSE.
+    switch (format)
+    {
+        case ADDR_FMT_8_8_8:
+        case ADDR_FMT_16_16_16:
+        case ADDR_FMT_16_16_16_FLOAT:
+        case ADDR_FMT_32_32_32:
+        case ADDR_FMT_32_32_32_FLOAT:
+            return TRUE;
+
+        default:
+            break;
+    }
+
+    return FALSE;
+}
+
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.h
new file mode 100644
index 00000000000..c302b3b1788
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrelemlib.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrelemlib.h
+* @brief Contains the class for element/pixel related functions
+***************************************************************************************************
+*/
+
+#ifndef __ELEM_LIB_H__
+#define __ELEM_LIB_H__
+
+#include "addrinterface.h"
+#include "addrobject.h"
+#include "addrcommon.h"
+
+class AddrLib;
+
+// Property bits of a format; 'value' overlays the same storage as the bit-field struct
+union ADDR_COMPONENT_FLAGS
+{
+    struct
+    {
+        UINT_32 byteAligned    : 1;    ///< all components are byte aligned
+        UINT_32 exportNorm     : 1;    ///< components support R6xx NORM compression
+        UINT_32 floatComp      : 1;    ///< there is at least one floating point component
+    };
+
+    UINT_32 value;                     ///< all property bits accessed as a single UINT_32
+};
+
+// Per-component number types, copied from the legacy lib's AddrNumberType
+enum AddrNumberType
+{
+    // The following number types have the range [-1..1]
+    ADDR_NO_NUMBER,         // This component doesn't exist and has no default value
+    ADDR_EPSILON,           // Force component value to integer 0x00000001
+    ADDR_ZERO,              // Force component value to integer 0x00000000
+    ADDR_ONE,               // Force component value to floating point 1.0
+    // Above values don't have any bits per component (keep ADDR_ONE the last of these)
+
+    ADDR_UNORM,             // Unsigned normalized (repeating fraction) full precision
+    ADDR_SNORM,             // Signed normalized (repeating fraction) full precision
+    ADDR_GAMMA,             // Gamma-corrected, full precision
+
+    ADDR_UNORM_R5XXRB,      // Unsigned normalized (repeating fraction) for r5xx RB
+    ADDR_SNORM_R5XXRB,      // Signed normalized (repeating fraction) for r5xx RB
+    ADDR_GAMMA_R5XXRB,      // Gamma-corrected for r5xx RB (note: unnormalized value)
+    ADDR_UNORM_R5XXBC,      // Unsigned normalized (repeating fraction) for r5xx BC
+    ADDR_SNORM_R5XXBC,      // Signed normalized (repeating fraction) for r5xx BC
+    ADDR_GAMMA_R5XXBC,      // Gamma-corrected for r5xx BC (note: unnormalized value)
+
+    ADDR_UNORM_R6XX,        // Unsigned normalized (repeating fraction) for R6xx
+    ADDR_UNORM_R6XXDB,      // Unorms for 24-bit depth: one value differs from ADDR_UNORM_R6XX
+    ADDR_SNORM_R6XX,        // Signed normalized (repeating fraction) for R6xx
+    ADDR_GAMMA8_R6XX,       // Gamma-corrected for r6xx
+    ADDR_GAMMA8_R7XX_TP,    // Gamma-corrected for r7xx TP 12bit unorm 8.4.
+
+    ADDR_U4FLOATC,          // Unsigned float: 4-bit exponent, bias=15, no NaN, clamp [0..1]
+    ADDR_GAMMA_4SEG,        // Gamma-corrected, four segment approximation
+    ADDR_U0FIXED,           // Unsigned 0.N-bit fixed point
+
+    // The following number types have large ranges (LEAVE ADDR_USCALED first or fix Finish routine)
+    ADDR_USCALED,           // Unsigned integer converted to/from floating point
+    ADDR_SSCALED,           // Signed integer converted to/from floating point
+    ADDR_USCALED_R5XXRB,    // Unsigned integer to/from floating point for r5xx RB
+    ADDR_SSCALED_R5XXRB,    // Signed integer to/from floating point for r5xx RB
+    ADDR_UINT_BITS,         // Keep in unsigned integer form, clamped to specified range
+    ADDR_SINT_BITS,         // Keep in signed integer form, clamped to specified range
+    ADDR_UINTBITS,          // @@ remove Keep in unsigned integer form, use modulus to reduce bits
+    ADDR_SINTBITS,          // @@ remove Keep in signed integer form, use modulus to reduce bits
+
+    // The following number types and ADDR_U4FLOATC have exponents
+    // (LEAVE ADDR_S8FLOAT first or fix Finish routine)
+    ADDR_S8FLOAT,           // Signed floating point with 8-bit exponent, bias=127
+    ADDR_S8FLOAT32,         // 32-bit IEEE float, passes through NaN values
+    ADDR_S5FLOAT,           // Signed floating point with 5-bit exponent, bias=15
+    ADDR_S5FLOATM,          // Signed floating point with 5-bit exponent, bias=15, no NaN/Inf
+    ADDR_U5FLOAT,           // Presumably unsigned (U prefix; was "Signed"), 5-bit exponent, bias=15
+    ADDR_U3FLOATM,          // Unsigned floating point with 3-bit exponent, bias=3
+
+    ADDR_S5FIXED,           // Signed 5.N-bit fixed point, with rounding
+
+    ADDR_END_NUMBER         // Used for range comparisons
+};
+
+// Element packing/compression modes, copied from the legacy lib's AddrElement
+enum AddrElemMode
+{
+    // These formats allow both packing and unpacking
+    ADDR_ROUND_BY_HALF,     // add 1/2 and truncate when packing this element
+    ADDR_ROUND_TRUNCATE,    // truncate toward 0 for sign/mag, else toward neg
+    ADDR_ROUND_DITHER,      // Pack by dithering -- requires (x,y) position
+
+    // These formats only allow unpacking, no packing
+    ADDR_UNCOMPRESSED,      // Elements are not compressed: one data element per pixel/texel
+    ADDR_EXPANDED,          // Elements are split up and stored in multiple data elements
+    ADDR_PACKED_STD,        // Elements are compressed into ExpandX by ExpandY data elements
+    ADDR_PACKED_REV,        // Like ADDR_PACKED, but X order of pixels is reversed
+    ADDR_PACKED_GBGR,       // Elements are compressed 4:2:2 in G1B_G0R order (high to low)
+    ADDR_PACKED_BGRG,       // Elements are compressed 4:2:2 in BG1_RG0 order (high to low)
+    ADDR_PACKED_BC1,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC2,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC3,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC4,        // Each data element is uncompressed to a 4x4 pixel/texel array
+    ADDR_PACKED_BC5,        // Each data element is uncompressed to a 4x4 pixel/texel array
+
+    // These formats provide various kinds of compression
+    ADDR_ZPLANE_R5XX,       // Compressed Zplane using r5xx architecture format
+    ADDR_ZPLANE_R6XX,       // Compressed Zplane using r6xx architecture format
+    //@@ Fill in the compression modes
+
+    ADDR_END_ELEMENT        // Used for range comparisons
+};
+
+enum AddrDepthPlanarType
+{
+    ADDR_DEPTH_PLANAR_NONE = 0, // No planar z/stencil
+    ADDR_DEPTH_PLANAR_R600 = 1, // R600 z and stencil planes are stored within a tile
+    ADDR_DEPTH_PLANAR_R800 = 2, // R800 has separate z and stencil planes
+};
+
+/**
+***************************************************************************************************
+*   ADDR_PIXEL_FORMATINFO
+*
+*   @brief
+*       Per component info (arrays hold one entry for each of up to four components)
+*
+***************************************************************************************************
+*/
+struct ADDR_PIXEL_FORMATINFO
+{
+    UINT_32             compBit[4];     ///< Presumably bits per component - TODO confirm
+    AddrNumberType      numType[4];     ///< Number type of each component
+    UINT_32             compStart[4];   ///< Presumably start bit of each component - TODO confirm
+    AddrElemMode        elemMode;       ///< Element mode, see AddrElemMode
+    UINT_32             comps;          ///< Number of components
+};
+
+/**
+***************************************************************************************************
+* @brief This class contains asic independent element related attributes and operations
+***************************************************************************************************
+*/
+class AddrElemLib : public AddrObject
+{
+protected:
+    AddrElemLib(AddrLib* const pAddrLib); ///< Protected constructor; see the static Create()
+
+public:
+
+    /// Virtual destructor (the only virtual member declared in this class)
+    virtual ~AddrElemLib();
+
+    static AddrElemLib *Create(
+        const AddrLib* const pAddrLib);
+
+    /// The implementation is only for R6xx/R7xx; note that it is declared non-virtual here
+    BOOL_32 PixGetExportNorm(
+        AddrColorFormat colorFmt,
+        AddrSurfaceNumber numberFmt, AddrSurfaceSwap swap) const;
+
+    /// The methods below are asic independent. Only some of them are declared static;
+    /// remove static if we need different operation in hwl.
+
+    VOID    Flt32ToDepthPixel(
+        AddrDepthFormat format, const ADDR_FLT_32 comps[2], UINT_8 *pPixel) const;
+
+    VOID    Flt32ToColorPixel(
+        AddrColorFormat format, AddrSurfaceNumber surfNum, AddrSurfaceSwap surfSwap,
+        const ADDR_FLT_32 comps[4], UINT_8 *pPixel) const;
+
+    static VOID    Flt32sToInt32s(
+        ADDR_FLT_32 value, UINT_32 bits, AddrNumberType numberType, UINT_32* pResult);
+
+    static VOID    Int32sToPixel(
+        UINT_32 numComps, UINT_32* pComps, UINT_32* pCompBits, UINT_32* pCompStart,
+        ADDR_COMPONENT_FLAGS properties, UINT_32 resultBits, UINT_8* pPixel);
+
+    VOID    PixGetColorCompInfo(
+        AddrColorFormat format, AddrSurfaceNumber number, AddrSurfaceSwap swap,
+        ADDR_PIXEL_FORMATINFO* pInfo) const;
+
+    VOID    PixGetDepthCompInfo(
+        AddrDepthFormat format, ADDR_PIXEL_FORMATINFO* pInfo) const;
+
+    UINT_32 GetBitsPerPixel(
+        AddrFormat format, AddrElemMode* pElemMode,
+        UINT_32* pExpandX = NULL, UINT_32* pExpandY = NULL, UINT_32* pBitsUnused = NULL);
+
+    static VOID    SetClearComps(
+        ADDR_FLT_32 comps[4], BOOL_32 clearColor, BOOL_32 float32);
+
+    VOID    AdjustSurfaceInfo(
+        AddrElemMode elemMode, UINT_32 expandX, UINT_32 expandY,
+        UINT_32* pBpp, UINT_32* pBasePitch, UINT_32* pWidth, UINT_32* pHeight);
+
+    VOID    RestoreSurfaceInfo(
+        AddrElemMode elemMode, UINT_32 expandX, UINT_32 expandY,
+        UINT_32* pBpp, UINT_32* pWidth, UINT_32* pHeight);
+
+    /// TRUE only when m_depthPlanarType is ADDR_DEPTH_PLANAR_R600 (planar inside a tile)
+    BOOL_32 IsDepthStencilTilePlanar()
+    {
+        return (m_depthPlanarType == ADDR_DEPTH_PLANAR_R600) ? TRUE : FALSE;
+    }
+
+    /// Sets m_configFlags, copied from AddrLib
+    VOID    SetConfigFlags(ADDR_CONFIG_FLAGS flags)
+    {
+        m_configFlags = flags;
+    }
+
+    static BOOL_32 IsCompressed(AddrFormat format);
+    static BOOL_32 IsBlockCompressed(AddrFormat format);
+    static BOOL_32 IsExpand3x(AddrFormat format);
+
+protected:
+
+    static VOID    GetCompBits(
+        UINT_32 c0, UINT_32 c1, UINT_32 c2, UINT_32 c3,
+        ADDR_PIXEL_FORMATINFO* pInfo,
+        AddrElemMode elemMode = ADDR_ROUND_BY_HALF);
+
+    static VOID    GetCompType(
+        AddrColorFormat format, AddrSurfaceNumber numType,
+        ADDR_PIXEL_FORMATINFO* pInfo);
+
+    static VOID    GetCompSwap(
+        AddrSurfaceSwap swap, ADDR_PIXEL_FORMATINFO* pInfo);
+
+    static VOID    SwapComps(
+        UINT_32 c0, UINT_32 c1, ADDR_PIXEL_FORMATINFO* pInfo);
+
+private:
+
+    UINT_32             m_fp16ExportNorm;   ///< If allow FP16 to be reported as EXPORT_NORM
+    AddrDepthPlanarType m_depthPlanarType;  ///< Read by IsDepthStencilTilePlanar()
+
+    ADDR_CONFIG_FLAGS   m_configFlags;      ///< Copy of AddrLib's configFlags
+    AddrLib* const      m_pAddrLib;         ///< Pointer to parent addrlib instance
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.cpp
new file mode 100644
index 00000000000..1df693e5be5
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.cpp
@@ -0,0 +1,4023 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrlib.cpp
+* @brief Contains the implementation for the AddrLib base class.
+***************************************************************************************************
+*/
+
+#include "addrinterface.h"
+#include "addrlib.h"
+#include "addrcommon.h"
+
+#if defined(__APPLE__)
+
+UINT_32 div64_32(UINT_64 n, UINT_32 base)
+{
+    UINT_64 rem = n;
+    UINT_64 b = base;
+    UINT_64 res, d = 1;
+    UINT_32 high = rem >> 32; // upper 32 bits of n
+    // Shift-and-subtract division that returns n % base; base == 0 divides by zero below
+    res = 0;
+    if (high >= base) // reduce the upper word first
+    {
+        high /= base;
+        res = (UINT_64) high << 32;
+        rem -= (UINT_64) (high*base) << 32;
+    }
+    // Double b (and its weight d) until b reaches rem or the top bit of b is set
+    while ((INT_64)b > 0 && b < rem)
+    {
+        b = b+b;
+        d = d+d;
+    }
+    // Subtract every scaled b that still fits, halving back down until d reaches 0
+    do
+    {
+        if (rem >= b)
+        {
+            rem -= b;
+            res += d;
+        }
+        b >>= 1;
+        d >>= 1;
+    } while (d);
+    // n is a by-value parameter: storing the quotient in it is not visible to the caller
+    n = res;
+    return rem;
+}
+
+extern "C" // C linkage for the symbol defined below (only built when __APPLE__ is defined)
+UINT_32 __umoddi3(UINT_64 n, UINT_32 base)
+{
+    return div64_32(n, base); // div64_32 returns the remainder, not the quotient
+}
+
+#endif // __APPLE__
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Static Const Member
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+const AddrTileModeFlags AddrLib::m_modeFlags[ADDR_TM_COUNT] =
+{// T   L  1  2  3  P  Pr B   (columns fill AddrTileModeFlags members in declaration order)
+    {1, 1, 0, 0, 0, 0, 0, 0}, // ADDR_TM_LINEAR_GENERAL
+    {1, 1, 0, 0, 0, 0, 0, 0}, // ADDR_TM_LINEAR_ALIGNED
+    {1, 0, 1, 0, 0, 0, 0, 0}, // ADDR_TM_1D_TILED_THIN1
+    {4, 0, 1, 0, 0, 0, 0, 0}, // ADDR_TM_1D_TILED_THICK
+    {1, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THIN1
+    {1, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THIN2
+    {1, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THIN4
+    {4, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_THICK
+    {1, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THIN1
+    {1, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THIN2
+    {1, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THIN4
+    {4, 0, 0, 1, 0, 0, 0, 1}, // ADDR_TM_2B_TILED_THICK
+    {1, 0, 0, 1, 1, 0, 0, 0}, // ADDR_TM_3D_TILED_THIN1
+    {4, 0, 0, 1, 1, 0, 0, 0}, // ADDR_TM_3D_TILED_THICK
+    {1, 0, 0, 1, 1, 0, 0, 1}, // ADDR_TM_3B_TILED_THIN1
+    {4, 0, 0, 1, 1, 0, 0, 1}, // ADDR_TM_3B_TILED_THICK
+    {8, 0, 0, 1, 0, 0, 0, 0}, // ADDR_TM_2D_TILED_XTHICK
+    {8, 0, 0, 1, 1, 0, 0, 0}, // ADDR_TM_3D_TILED_XTHICK
+    {1, 0, 0, 0, 0, 0, 0, 0}, // ADDR_TM_POWER_SAVE
+    {1, 0, 0, 1, 0, 1, 1, 0}, // ADDR_TM_PRT_TILED_THIN1
+    {1, 0, 0, 1, 0, 1, 0, 0}, // ADDR_TM_PRT_2D_TILED_THIN1
+    {1, 0, 0, 1, 1, 1, 0, 0}, // ADDR_TM_PRT_3D_TILED_THIN1
+    {4, 0, 0, 1, 0, 1, 1, 0}, // ADDR_TM_PRT_TILED_THICK
+    {4, 0, 0, 1, 0, 1, 0, 0}, // ADDR_TM_PRT_2D_TILED_THICK
+    {4, 0, 0, 1, 1, 1, 0, 0}, // ADDR_TM_PRT_3D_TILED_THICK
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Constructor/Destructor
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::AddrLib
+*
+*   @brief
+*       Default constructor: invalid chip family, zeroed counts and config flags, no element
+*       lib; minimum pitch alignment starts at 1 and the maximum sample count at 8
+***************************************************************************************************
+*/
+AddrLib::AddrLib() :
+    m_class(BASE_ADDRLIB),
+    m_chipFamily(ADDR_CHIP_FAMILY_IVLD),    // replaced later through SetAddrChipFamily
+    m_chipRevision(0),
+    m_version(ADDRLIB_VERSION),
+    m_pipes(0),
+    m_banks(0),
+    m_pipeInterleaveBytes(0),
+    m_rowSize(0),
+    m_minPitchAlignPixels(1),
+    m_maxSamples(8),
+    m_pElemLib(NULL)                        // assigned in AddrLib::Create
+{
+    m_configFlags.value = 0;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::AddrLib
+*
+*   @brief
+*       Constructor taking an AddrClient pointer, which is forwarded to the AddrObject base;
+*       every other member gets the same initial value as in the default constructor
+***************************************************************************************************
+*/
+AddrLib::AddrLib(const AddrClient* pClient) :
+    AddrObject(pClient),
+    m_class(BASE_ADDRLIB),
+    m_chipFamily(ADDR_CHIP_FAMILY_IVLD),
+    m_chipRevision(0),
+    m_version(ADDRLIB_VERSION),
+    m_pipes(0),
+    m_banks(0),
+    m_pipeInterleaveBytes(0),
+    m_rowSize(0),
+    m_minPitchAlignPixels(1),
+    m_maxSamples(8),
+    m_pElemLib(NULL)
+{
+    m_configFlags.value = 0;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::~AddrLib
+*
+*   @brief
+*       Destructor for the AddrLib class; deletes the element lib when one is attached
+*
+***************************************************************************************************
+*/
+AddrLib::~AddrLib()
+{
+    if (m_pElemLib != NULL)
+    {
+        delete m_pElemLib;
+    }
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Initialization/Helper
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::Create
+*
+*   @brief
+*       Creates and initializes an AddrLib object; pCreateOut->hLib receives it (NULL on failure).
+*       Both pointers are dereferenced without a NULL check.
+*   @return
+*       ADDR_OK, ADDR_PARAMSIZEMISMATCH, ADDR_INVALIDGBREGVALUES or ADDR_ERROR
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::Create(
+    const ADDR_CREATE_INPUT* pCreateIn,     ///< [in] pointer to ADDR_CREATE_INPUT
+    ADDR_CREATE_OUTPUT*      pCreateOut)    ///< [out] pointer to ADDR_CREATE_OUTPUT
+{
+    AddrLib* pLib = NULL;
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+    // Struct-size validation is only performed when createFlags.fillSizeFields is set
+    if (pCreateIn->createFlags.fillSizeFields == TRUE)
+    {
+        if ((pCreateIn->size != sizeof(ADDR_CREATE_INPUT)) ||
+            (pCreateOut->size != sizeof(ADDR_CREATE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+    // Both memory callbacks must be supplied before a hardware layer is instantiated
+    if ((returnCode == ADDR_OK)                    &&
+        (pCreateIn->callbacks.allocSysMem != NULL) &&
+        (pCreateIn->callbacks.freeSysMem != NULL))
+    {
+        AddrClient client = {
+            pCreateIn->hClient,
+            pCreateIn->callbacks
+        };
+
+        switch (pCreateIn->chipEngine)
+        {
+            case CIASICIDGFXENGINE_SOUTHERNISLAND:
+                switch (pCreateIn->chipFamily)
+                {
+                    case FAMILY_SI:
+                        pLib = AddrSIHwlInit(&client);
+                        break;
+                    case FAMILY_VI:
+                    case FAMILY_CZ: // VI based fusion(carrizo)
+                    case FAMILY_CI:
+                    case FAMILY_KV: // CI based fusion
+                        pLib = AddrCIHwlInit(&client);
+                        break;
+                    default:
+                        ADDR_ASSERT_ALWAYS();
+                        break;
+                }
+                break;
+            default:
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+    }
+
+    if ((pLib != NULL))
+    {
+        BOOL_32 initValid;
+
+        // Pass createFlags to configFlags first since these flags may be overwritten
+        pLib->m_configFlags.noCubeMipSlicesPad  = pCreateIn->createFlags.noCubeMipSlicesPad;
+        pLib->m_configFlags.fillSizeFields      = pCreateIn->createFlags.fillSizeFields;
+        pLib->m_configFlags.useTileIndex        = pCreateIn->createFlags.useTileIndex;
+        pLib->m_configFlags.useCombinedSwizzle  = pCreateIn->createFlags.useCombinedSwizzle;
+        pLib->m_configFlags.checkLast2DLevel    = pCreateIn->createFlags.checkLast2DLevel;
+        pLib->m_configFlags.useHtileSliceAlign  = pCreateIn->createFlags.useHtileSliceAlign;
+        pLib->m_configFlags.degradeBaseLevel    = pCreateIn->createFlags.degradeBaseLevel;
+        pLib->m_configFlags.allowLargeThickTile = pCreateIn->createFlags.allowLargeThickTile;
+
+        pLib->SetAddrChipFamily(pCreateIn->chipFamily, pCreateIn->chipRevision);
+
+        pLib->SetMinPitchAlignPixels(pCreateIn->minPitchAlignPixels);
+
+        // Global parameters initialized and remaining configFlags bits are set as well
+        initValid = pLib->HwlInitGlobalParams(pCreateIn);
+
+        if (initValid)
+        {
+            pLib->m_pElemLib = AddrElemLib::Create(pLib);
+        }
+        else
+        {
+            pLib->m_pElemLib = NULL; // Don't go on allocating element lib
+            returnCode = ADDR_INVALIDGBREGVALUES;
+        }
+        // Reached for an invalid init as well as for a NULL result from AddrElemLib::Create
+        if (pLib->m_pElemLib == NULL)
+        {
+            delete pLib;
+            pLib = NULL;
+            ADDR_ASSERT_ALWAYS();
+        }
+        else
+        {
+            pLib->m_pElemLib->SetConfigFlags(pLib->m_configFlags);
+        }
+    }
+    // hLib is written on every path, so it is NULL whenever creation failed
+    pCreateOut->hLib = pLib;
+
+    if ((pLib == NULL) &&
+        (returnCode == ADDR_OK))
+    {
+        // Unknown failures, we return the general error code
+        returnCode = ADDR_ERROR;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::SetAddrChipFamily
+*
+*   @brief
+*       Converts the client's family id via HwlConvertChipFamily, then stores family and revision
+*   @return
+*      N/A
+***************************************************************************************************
+*/
+VOID AddrLib::SetAddrChipFamily(
+    UINT_32 uChipFamily,        ///< [in] chip family id supplied by the client
+    UINT_32 uChipRevision)      ///< [in] chip revision defined in "asic_family"_id.h
+{
+    // Let the hardware layer translate the raw ids into an AddrChipFamily value
+    const AddrChipFamily convertedFamily = HwlConvertChipFamily(uChipFamily, uChipRevision);
+
+    ADDR_ASSERT(convertedFamily != ADDR_CHIP_FAMILY_IVLD);
+
+    // Record the converted family together with the unmodified revision
+    m_chipFamily    = convertedFamily;
+    m_chipRevision  = uChipRevision;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::SetMinPitchAlignPixels
+*
+*   @brief
+*       Stores the requested minimum pitch alignment; an input of 0 is stored as 1
+*
+*   @return
+*      N/A
+***************************************************************************************************
+*/
+VOID AddrLib::SetMinPitchAlignPixels(
+    UINT_32 minPitchAlignPixels)    ///< [in] minimum pitch alignment in pixels
+{
+    m_minPitchAlignPixels = (minPitchAlignPixels != 0) ? minPitchAlignPixels : 1;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::GetAddrLib
+*
+*   @brief
+*       Reinterprets an ADDR_HANDLE as an AddrLib pointer via static_cast (no validation)
+*   @return
+*      An AddrLib class pointer
+***************************************************************************************************
+*/
+AddrLib * AddrLib::GetAddrLib(
+    ADDR_HANDLE hLib)   ///< [in] handle of ADDR_HANDLE
+{
+    AddrLib* const pLib = static_cast<AddrLib *>(hLib);
+    return pLib;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Surface Methods
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceInfo
+*
+*   @brief
+*       Interface function stub of AddrComputeSurfaceInfo.
+*
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSurfaceInfo(
+     const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,    ///< [in] input structure
+     ADDR_COMPUTE_SURFACE_INFO_OUTPUT*      pOut    ///< [out] output structure
+     ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    // We suggest client do sanity check but a check here is also good
+    if (pIn->bpp > 128)
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    // Thick modes don't support multisample
+    if (ComputeSurfaceThickness(pIn->tileMode) > 1 && pIn->numSamples > 1)
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        // Get a local copy of input structure and only reference pIn for unadjusted values
+        ADDR_COMPUTE_SURFACE_INFO_INPUT localIn = *pIn;
+        ADDR_TILEINFO tileInfoNull = {0};
+
+        if (UseTileInfo())
+        {
+            // If the original input has a valid ADDR_TILEINFO pointer then copy its contents.
+            // Otherwise the default 0's in tileInfoNull are used.
+            if (pIn->pTileInfo)
+            {
+                tileInfoNull = *pIn->pTileInfo;
+            }
+            localIn.pTileInfo  = &tileInfoNull;
+        }
+
+        localIn.numSamples = pIn->numSamples == 0 ? 1 : pIn->numSamples;
+
+        // Do mipmap check first
+        // If format is BCn, pre-pad dimension to power-of-two according to HWL
+        ComputeMipLevel(&localIn);
+
+        if (m_configFlags.checkLast2DLevel)
+        {
+            // Save this level's original height in pixels
+            pOut->height = pIn->height;
+        }
+
+        UINT_32 expandX = 1;
+        UINT_32 expandY = 1;
+        AddrElemMode elemMode;
+
+        // Save outputs that may not go through HWL
+        pOut->pixelBits = localIn.bpp;
+        pOut->numSamples = localIn.numSamples;
+        pOut->last2DLevel = FALSE;
+
+#if !ALT_TEST
+        if (localIn.numSamples > 1)
+        {
+            ADDR_ASSERT(localIn.mipLevel == 0);
+        }
+#endif
+
+        if (localIn.format != ADDR_FMT_INVALID) // Set format to INVALID will skip this conversion
+        {
+            // Get compression/expansion factors and element mode
+            // (which indicates compression/expansion
+            localIn.bpp = GetElemLib()->GetBitsPerPixel(localIn.format,
+                                                        &elemMode,
+                                                        &expandX,
+                                                        &expandY);
+
+            // Special flag for 96 bit surface. 96 (or 48 if we support) bit surface's width is
+            // pre-multiplied by 3 and bpp is divided by 3. So pitch alignment for linear-
+            // aligned does not meet 64-pixel in real. We keep special handling in hwl since hw
+            // restrictions are different.
+            // Also Mip 1+ needs an element pitch of 32 bits so we do not need this workaround
+            // but we use this flag to skip RestoreSurfaceInfo below
+
+            if ((elemMode == ADDR_EXPANDED) &&
+                (expandX > 1))
+            {
+                ADDR_ASSERT(localIn.tileMode == ADDR_TM_LINEAR_ALIGNED || localIn.height == 1);
+            }
+
+            GetElemLib()->AdjustSurfaceInfo(elemMode,
+                                            expandX,
+                                            expandY,
+                                            &localIn.bpp,
+                                            &localIn.basePitch,
+                                            &localIn.width,
+                                            &localIn.height);
+
+            // Overwrite these parameters if we have a valid format
+        }
+        else if (localIn.bpp != 0)
+        {
+            localIn.width  = (localIn.width != 0) ? localIn.width : 1;
+            localIn.height = (localIn.height != 0) ? localIn.height : 1;
+        }
+        else // Rule out some invalid parameters
+        {
+            ADDR_ASSERT_ALWAYS();
+
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+
+        // Check mipmap after surface expansion
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = PostComputeMipLevel(&localIn, pOut);
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            if (UseTileIndex(localIn.tileIndex))
+            {
+                // Make sure pTileInfo is not NULL
+                ADDR_ASSERT(localIn.pTileInfo);
+
+                UINT_32 numSamples = GetNumFragments(localIn.numSamples, localIn.numFrags);
+
+                INT_32 macroModeIndex = TileIndexNoMacroIndex;
+
+                if (localIn.tileIndex != TileIndexLinearGeneral)
+                {
+                    // Try finding a macroModeIndex
+                    macroModeIndex = HwlComputeMacroModeIndex(localIn.tileIndex,
+                                                              localIn.flags,
+                                                              localIn.bpp,
+                                                              numSamples,
+                                                              localIn.pTileInfo,
+                                                              &localIn.tileMode,
+                                                              &localIn.tileType);
+                }
+
+                // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+                if (macroModeIndex == TileIndexNoMacroIndex)
+                {
+                    returnCode = HwlSetupTileCfg(localIn.tileIndex, macroModeIndex,
+                                                 localIn.pTileInfo,
+                                                 &localIn.tileMode, &localIn.tileType);
+                }
+                // If macroModeIndex is invalid, then assert this is not macro tiled
+                else if (macroModeIndex == TileIndexInvalid)
+                {
+                    ADDR_ASSERT(!IsMacroTiled(localIn.tileMode));
+                }
+            }
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            AddrTileMode tileMode = localIn.tileMode;
+            AddrTileType tileType = localIn.tileType;
+
+            // HWL layer may override tile mode if necessary
+            if (HwlOverrideTileMode(&localIn, &tileMode, &tileType))
+            {
+                localIn.tileMode = tileMode;
+                localIn.tileType = tileType;
+            }
+            // Degrade base level if applicable
+            if (DegradeBaseLevel(&localIn, &tileMode))
+            {
+                localIn.tileMode = tileMode;
+            }
+        }
+
+        // Call main function to compute surface info
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSurfaceInfo(&localIn, pOut);
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            // Since bpp might be changed we just pass it through
+            pOut->bpp  = localIn.bpp;
+
+            // Also original width/height/bpp
+            pOut->pixelPitch    = pOut->pitch;
+            pOut->pixelHeight   = pOut->height;
+
+#if DEBUG
+            if (localIn.flags.display)
+            {
+                ADDR_ASSERT((pOut->pitchAlign % 32) == 0);
+            }
+#endif //DEBUG
+
+            if (localIn.format != ADDR_FMT_INVALID)
+            {
+                //
+                // 96 bits surface of level 1+ requires element pitch of 32 bits instead
+                // In hwl function we skip multiplication of 3 then we should skip division of 3
+                // We keep pitch that represents 32 bit element instead of 96 bits since we
+                // will get an odd number if divided by 3.
+                //
+                if (!((expandX == 3) && (localIn.mipLevel > 0)))
+                {
+
+                    GetElemLib()->RestoreSurfaceInfo(elemMode,
+                                                     expandX,
+                                                     expandY,
+                                                     &localIn.bpp,
+                                                     &pOut->pixelPitch,
+                                                     &pOut->pixelHeight);
+                }
+            }
+
+            if (localIn.flags.qbStereo)
+            {
+                if (pOut->pStereoInfo)
+                {
+                    ComputeQbStereoInfo(pOut);
+                }
+            }
+
+            if (localIn.flags.volume) // For volume sliceSize equals to all z-slices
+            {
+                pOut->sliceSize = pOut->surfSize;
+            }
+            else // For array: sliceSize is likely to have slice-padding (the last one)
+            {
+                pOut->sliceSize = pOut->surfSize / pOut->depth;
+
+                // array or cubemap
+                if (pIn->numSlices > 1)
+                {
+                    // If this is the last slice then add the padding size to this slice
+                    if (pIn->slice == (pIn->numSlices - 1))
+                    {
+                        pOut->sliceSize += pOut->sliceSize * (pOut->depth - pIn->numSlices);
+                    }
+                    else if (m_configFlags.checkLast2DLevel)
+                    {
+                        // Reset last2DLevel flag if this is not the last array slice
+                        pOut->last2DLevel = FALSE;
+                    }
+                }
+            }
+
+            pOut->pitchTileMax = pOut->pitch / 8 - 1;
+            pOut->heightTileMax = pOut->height / 8 - 1;
+            pOut->sliceTileMax = pOut->pitch * pOut->height / 64 - 1;
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceAddrFromCoord
+*
+*   @brief
+*       Validate sizes, resolve tile index if used, call HwlComputeSurfaceAddrFromCoord.
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSurfaceAddrFromCoord(
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            const ADDR_SURFACE_FLAGS flags = {{0}}; // all surface flags cleared
+            UINT_32 numSamples = GetNumFragments(pIn->numSamples, pIn->numFrags);
+
+            // Try finding a macroModeIndex
+            INT_32 macroModeIndex = HwlComputeMacroModeIndex(input.tileIndex,
+                                                             flags,
+                                                             input.bpp,
+                                                             numSamples,
+                                                             input.pTileInfo,
+                                                             &input.tileMode,
+                                                             &input.tileType);
+
+            // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+            if (macroModeIndex == TileIndexNoMacroIndex)
+            {
+                returnCode = HwlSetupTileCfg(input.tileIndex, macroModeIndex,
+                                             input.pTileInfo, &input.tileMode, &input.tileType);
+            }
+            // If macroModeIndex is invalid, then assert this is not macro tiled
+            else if (macroModeIndex == TileIndexInvalid)
+            {
+                ADDR_ASSERT(!IsMacroTiled(input.tileMode));
+            }
+
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSurfaceAddrFromCoord(pIn, pOut);
+
+            if (returnCode == ADDR_OK)
+            {
+                pOut->prtBlockIndex = static_cast<UINT_32>(pOut->addr / (64 * 1024));
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceCoordFromAddr
+*
+*   @brief
+*       Validate sizes, resolve tile index if used, call HwlComputeSurfaceCoordFromAddr.
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSurfaceCoordFromAddr(
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            const ADDR_SURFACE_FLAGS flags = {{0}}; // all surface flags cleared
+            UINT_32 numSamples = GetNumFragments(pIn->numSamples, pIn->numFrags);
+
+            // Try finding a macroModeIndex
+            INT_32 macroModeIndex = HwlComputeMacroModeIndex(input.tileIndex,
+                                                             flags,
+                                                             input.bpp,
+                                                             numSamples,
+                                                             input.pTileInfo,
+                                                             &input.tileMode,
+                                                             &input.tileType);
+
+            // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+            if (macroModeIndex == TileIndexNoMacroIndex)
+            {
+                returnCode = HwlSetupTileCfg(input.tileIndex, macroModeIndex,
+                                             input.pTileInfo, &input.tileMode, &input.tileType);
+            }
+            // If macroModeIndex is invalid, then assert this is not macro tiled
+            else if (macroModeIndex == TileIndexInvalid)
+            {
+                ADDR_ASSERT(!IsMacroTiled(input.tileMode));
+            }
+
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSurfaceCoordFromAddr(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSliceTileSwizzle
+*
+*   @brief
+*       Validate sizes, resolve tile index if used, call HwlComputeSliceTileSwizzle.
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeSliceTileSwizzle(
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_SLICESWIZZLE_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_SLICESWIZZLE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_COMPUTE_SLICESWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex,
+                                         input.pTileInfo, &input.tileMode);
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlComputeSliceTileSwizzle(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ExtractBankPipeSwizzle
+*
+*   @brief
+*       Validate sizes, resolve tile index if used, call HwlExtractBankPipeSwizzle.
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ExtractBankPipeSwizzle(
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT)) ||
+            (pOut->size != sizeof(ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlExtractBankPipeSwizzle(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::CombineBankPipeSwizzle
+*
+*   @brief
+*       Validate sizes, resolve tile index if used, call HwlCombineBankPipeSwizzle.
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::CombineBankPipeSwizzle(
+    const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlCombineBankPipeSwizzle(pIn->bankSwizzle,
+                                                   pIn->pipeSwizzle,
+                                                   pIn->pTileInfo,
+                                                   pIn->baseAddr,
+                                                   &pOut->tileSwizzle);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeBaseSwizzle
+*
+*   @brief
+*       Interface function stub of AddrComputeBaseSwizzle; tileSwizzle is 0 if not macro tiled.
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeBaseSwizzle(
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_COMPUTE_BASE_SWIZZLE_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            if (IsMacroTiled(pIn->tileMode))
+            {
+                returnCode = HwlComputeBaseSwizzle(pIn, pOut);
+            }
+            else
+            {
+                pOut->tileSwizzle = 0; // HWL is not consulted for non-macro-tiled modes
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeFmaskInfo
+*
+*   @brief
+*       Interface function stub of ComputeFmaskInfo; rejects thick tile modes and numSamples <= 1.
+*
+*   @return
+*       ADDR_E_RETURNCODE (size mismatch is reported before any other input field is examined)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut    ///< [out] output structure
+    )
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_FMASK_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_FMASK_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    // No thick MSAA; tileMode is only read once the size check has passed
+    if ((returnCode == ADDR_OK) && (ComputeSurfaceThickness(pIn->tileMode) > 1))
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, used when pOut->pTileInfo is NULL
+        ADDR_COMPUTE_FMASK_INFO_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+
+            if (pOut->pTileInfo)
+            {
+                // Use the caller-provided output tile info for calculation
+                input.pTileInfo = pOut->pTileInfo;
+            }
+            else
+            {
+                input.pTileInfo = &tileInfoNull; // fall back to temp tile info
+            }
+
+            ADDR_SURFACE_FLAGS flags = {{0}};
+            flags.fmask = 1;
+
+            // Try finding a macroModeIndex
+            INT_32 macroModeIndex = HwlComputeMacroModeIndex(pIn->tileIndex,
+                                                             flags,
+                                                             HwlComputeFmaskBits(pIn, NULL),
+                                                             pIn->numSamples,
+                                                             input.pTileInfo,
+                                                             &input.tileMode);
+
+            // If macroModeIndex is not needed, then call HwlSetupTileCfg to get tile info
+            if (macroModeIndex == TileIndexNoMacroIndex)
+            {
+                returnCode = HwlSetupTileCfg(input.tileIndex, macroModeIndex,
+                                             input.pTileInfo, &input.tileMode);
+            }
+
+            ADDR_ASSERT(macroModeIndex != TileIndexInvalid);
+
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            if (pIn->numSamples > 1)
+            {
+                returnCode = HwlComputeFmaskInfo(pIn, pOut);
+            }
+            else
+            {
+                memset(pOut, 0, sizeof(ADDR_COMPUTE_FMASK_INFO_OUTPUT)); // clears every field
+
+                returnCode = ADDR_INVALIDPARAMS;
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Interface function stub of ComputeFmaskAddrFromCoord; requires numSamples > 1.
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_INVALIDPARAMS if numSamples <= 1)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeFmaskAddrFromCoord(
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_ASSERT(pIn->numSamples > 1); // the same condition is also checked at run time below
+
+        if (pIn->numSamples > 1)
+        {
+            returnCode = HwlComputeFmaskAddrFromCoord(pIn, pOut);
+        }
+        else
+        {
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Interface function stub of ComputeFmaskCoordFromAddr; requires numSamples > 1.
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_INVALIDPARAMS if numSamples <= 1)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeFmaskCoordFromAddr(
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*  pIn,     ///< [in] input structure
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut           ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_ASSERT(pIn->numSamples > 1); // the same condition is also checked at run time below
+
+        if (pIn->numSamples > 1)
+        {
+            returnCode = HwlComputeFmaskCoordFromAddr(pIn, pOut);
+        }
+        else
+        {
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ConvertTileInfoToHW
+*
+*   @brief
+*       Convert tile info from real value to HW register value in HW layer
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ConvertTileInfoToHW(
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_CONVERT_TILEINFOTOHW_INPUT)) ||
+            (pOut->size != sizeof(ADDR_CONVERT_TILEINFOTOHW_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_CONVERT_TILEINFOTOHW_INPUT input;
+        // if pIn->reverse is TRUE, indices are ignored
+        if (pIn->reverse == FALSE && UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            input.pTileInfo = &tileInfoNull; // tile info is filled in from the tile index
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            pIn = &input; // continue with the local copy; the caller's structure is not modified
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = HwlConvertTileInfoToHW(pIn, pOut);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ConvertTileIndex
+*
+*   @brief
+*       Convert tile index to tile mode/type/info; optionally convert the info to HW values
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ConvertTileIndex(
+    const ADDR_CONVERT_TILEINDEX_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINDEX_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_CONVERT_TILEINDEX_INPUT)) ||
+            (pOut->size != sizeof(ADDR_CONVERT_TILEINDEX_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+
+        returnCode = HwlSetupTileCfg(pIn->tileIndex, pIn->macroModeIndex,
+                                     pOut->pTileInfo, &pOut->tileMode, &pOut->tileType);
+
+        if (returnCode == ADDR_OK && pIn->tileInfoHw)
+        {
+            ADDR_CONVERT_TILEINFOTOHW_INPUT hwInput = {0};
+            ADDR_CONVERT_TILEINFOTOHW_OUTPUT hwOutput = {0};
+
+            hwInput.pTileInfo = pOut->pTileInfo;
+            hwInput.tileIndex = -1;
+            hwOutput.pTileInfo = pOut->pTileInfo; // same buffer as the input: converted in place
+
+            returnCode = HwlConvertTileInfoToHW(&hwInput, &hwOutput);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ConvertTileIndex1
+*
+*   @brief
+*       Convert tile index to tile mode/type/info via HwlComputeMacroModeIndex (bpp, numSamples)
+*
+*   @return
+*       ADDR_E_RETURNCODE (the result of HwlComputeMacroModeIndex itself is not checked)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ConvertTileIndex1(
+    const ADDR_CONVERT_TILEINDEX1_INPUT* pIn,   ///< [in] input structure
+    ADDR_CONVERT_TILEINDEX_OUTPUT* pOut         ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_CONVERT_TILEINDEX1_INPUT)) ||
+            (pOut->size != sizeof(ADDR_CONVERT_TILEINDEX_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_SURFACE_FLAGS flags = {{0}}; // all surface flags cleared
+
+        HwlComputeMacroModeIndex(pIn->tileIndex, flags, pIn->bpp, pIn->numSamples,
+                                 pOut->pTileInfo, &pOut->tileMode, &pOut->tileType);
+
+        if (pIn->tileInfoHw)
+        {
+            ADDR_CONVERT_TILEINFOTOHW_INPUT hwInput = {0};
+            ADDR_CONVERT_TILEINFOTOHW_OUTPUT hwOutput = {0};
+
+            hwInput.pTileInfo = pOut->pTileInfo;
+            hwInput.tileIndex = -1;
+            hwOutput.pTileInfo = pOut->pTileInfo; // same buffer as the input: converted in place
+
+            returnCode = HwlConvertTileInfoToHW(&hwInput, &hwOutput);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::GetTileIndex
+*
+*   @brief
+*       Get tile index from tile mode/type/info; validates sizes, then calls HwlGetTileIndex
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::GetTileIndex(
+    const ADDR_GET_TILEINDEX_INPUT* pIn, ///< [in] input structure
+    ADDR_GET_TILEINDEX_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_GET_TILEINDEX_INPUT)) ||
+            (pOut->size != sizeof(ADDR_GET_TILEINDEX_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        returnCode = HwlGetTileIndex(pIn, pOut);
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceThickness
+*
+*   @brief
+*       Look up the thickness stored in m_modeFlags for a tile mode (no range check on tileMode)
+*
+*   @return
+*       Surface thickness
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeSurfaceThickness(
+    AddrTileMode tileMode)    ///< [in] tile mode
+{
+    return m_modeFlags[tileMode].thickness;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               CMASK/HTILE
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileInfo
+*
+*   @brief
+*       Interface function stub of AddrComputeHtileInfo
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeHtileInfo(
+    const ADDR_COMPUTE_HTILE_INFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_HTILE_INFO_OUTPUT*         pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    BOOL_32 isWidth8  = (pIn->blockWidth == 8) ? TRUE : FALSE;   // read before the size check
+    BOOL_32 isHeight8 = (pIn->blockHeight == 8) ? TRUE : FALSE;  // read before the size check
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_COMPUTE_HTILE_INFO_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            pOut->bpp = ComputeHtileInfo(pIn->flags,
+                                         pIn->pitch,
+                                         pIn->height,
+                                         pIn->numSlices,
+                                         pIn->isLinear,
+                                         isWidth8,
+                                         isHeight8,
+                                         pIn->pTileInfo,
+                                         &pOut->pitch,
+                                         &pOut->height,
+                                         &pOut->htileBytes,
+                                         &pOut->macroWidth,
+                                         &pOut->macroHeight,
+                                         &pOut->sliceSize,
+                                         &pOut->baseAlign);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskInfo
+*
+*   @brief
+*       Interface function stub of AddrComputeCmaskInfo
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_PARAMSIZEMISMATCH if size checking is on and a size differs)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskInfo(
+    const ADDR_COMPUTE_CMASK_INFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_CMASK_INFO_OUTPUT*         pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ADDR_COMPUTE_CMASK_INFO_INPUT)) ||
+            (pOut->size != sizeof(ADDR_COMPUTE_CMASK_INFO_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        ADDR_TILEINFO tileInfoNull; // scratch tile info, only used on the tile index path
+        ADDR_COMPUTE_CMASK_INFO_INPUT input;
+
+        if (UseTileIndex(pIn->tileIndex))
+        {
+            input = *pIn;
+            // Use temp tile info for calculation
+            input.pTileInfo = &tileInfoNull;
+
+            returnCode = HwlSetupTileCfg(input.tileIndex, input.macroModeIndex, input.pTileInfo);
+
+            // Redirect pIn to the local copy; the caller's structure is not modified
+            pIn = &input;
+        }
+
+        if (returnCode == ADDR_OK)
+        {
+            returnCode = ComputeCmaskInfo(pIn->flags,
+                                          pIn->pitch,
+                                          pIn->height,
+                                          pIn->numSlices,
+                                          pIn->isLinear,
+                                          pIn->pTileInfo,
+                                          &pOut->pitch,
+                                          &pOut->height,
+                                          &pOut->cmaskBytes,
+                                          &pOut->macroWidth,
+                                          &pOut->macroHeight,
+                                          &pOut->sliceSize,
+                                          &pOut->baseAlign,
+                                          &pOut->blockMax);
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeDccInfo
+*
+*   @brief
+*       Interface function that validates the request and computes DCC key info
+*
+*   @return
+*       ADDR_PARAMSIZEMISMATCH, a failure code of HwlSetupTileCfg, or HwlComputeDccInfo's result
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeDccInfo(
+    const ADDR_COMPUTE_DCCINFO_INPUT*    pIn,    ///< [in] input structure
+    ADDR_COMPUTE_DCCINFO_OUTPUT*         pOut    ///< [out] output structure
+    ) const
+{
+    // Structure sizes are only verified when size checking is enabled.
+    if ((GetFillSizeFieldsFlags() == TRUE) &&
+        ((pIn->size != sizeof(ADDR_COMPUTE_DCCINFO_INPUT)) ||
+         (pOut->size != sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT))))
+    {
+        return ADDR_PARAMSIZEMISMATCH;
+    }
+
+    ADDR_COMPUTE_DCCINFO_INPUT localIn;
+
+    if (UseTileIndex(pIn->tileIndex))
+    {
+        // HwlSetupTileCfg is pointed at a private copy, never at the caller's structure.
+        localIn = *pIn;
+
+        const ADDR_E_RETURNCODE setupCode = HwlSetupTileCfg(localIn.tileIndex,
+                                                            localIn.macroModeIndex,
+                                                            &localIn.tileInfo,
+                                                            &localIn.tileMode);
+
+        if (setupCode != ADDR_OK)
+        {
+            return setupCode;
+        }
+
+        // All remaining reads go through the patched copy.
+        pIn = &localIn;
+    }
+
+    // The hardware layer does the actual work; its return code is passed through
+    // unchanged.
+    return HwlComputeDccInfo(pIn, pOut);
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileAddrFromCoord
+*
+*   @brief
+*       Interface function behind AddrComputeHtileAddrFromCoord: htile address of a coordinate
+*
+*   @return
+*       ADDR_OK on success; ADDR_PARAMSIZEMISMATCH or a HwlSetupTileCfg failure code otherwise
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeHtileAddrFromCoord(
+    const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    // The helper below takes the htile block dimensions as "is 8" flags.
+    const BOOL_32 blockWidthIs8  = (pIn->blockWidth == 8) ? TRUE : FALSE;
+    const BOOL_32 blockHeightIs8 = (pIn->blockHeight == 8) ? TRUE : FALSE;
+
+    // Structure sizes are only verified when size checking is enabled.
+    if ((GetFillSizeFieldsFlags() == TRUE) &&
+        ((pIn->size != sizeof(ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT)) ||
+         (pOut->size != sizeof(ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT))))
+    {
+        return ADDR_PARAMSIZEMISMATCH;
+    }
+
+    // Scratch storage, only used when UseTileIndex() accepts the caller's tile index.
+    ADDR_TILEINFO                          scratchTileInfo;
+    ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT localIn;
+
+    if (UseTileIndex(pIn->tileIndex))
+    {
+        // Work on a private copy so that the caller's input structure stays untouched.
+        localIn           = *pIn;
+        localIn.pTileInfo = &scratchTileInfo;
+
+        const ADDR_E_RETURNCODE setupCode =
+            HwlSetupTileCfg(localIn.tileIndex, localIn.macroModeIndex, localIn.pTileInfo);
+
+        if (setupCode != ADDR_OK)
+        {
+            return setupCode;
+        }
+
+        // All remaining reads go through the patched copy.
+        pIn = &localIn;
+    }
+
+    // A factor of 1 selects the HTILE path of the shared xmask helper; the output address
+    // is only written once every check above has passed.
+    pOut->addr = HwlComputeXmaskAddrFromCoord(pIn->pitch,
+                                              pIn->height,
+                                              pIn->x,
+                                              pIn->y,
+                                              pIn->slice,
+                                              pIn->numSlices,
+                                              1,
+                                              pIn->isLinear,
+                                              blockWidthIs8,
+                                              blockHeightIs8,
+                                              pIn->pTileInfo,
+                                              &pOut->bitPosition);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileCoordFromAddr
+*
+*   @brief
+*       Interface function behind AddrComputeHtileCoordFromAddr: coordinate of an htile address
+*
+*   @return
+*       ADDR_OK on success; ADDR_PARAMSIZEMISMATCH or a HwlSetupTileCfg failure code otherwise
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeHtileCoordFromAddr(
+    const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    // The helper below takes the htile block dimensions as "is 8" flags.
+    const BOOL_32 blockWidthIs8  = (pIn->blockWidth == 8) ? TRUE : FALSE;
+    const BOOL_32 blockHeightIs8 = (pIn->blockHeight == 8) ? TRUE : FALSE;
+
+    // Structure sizes are only verified when size checking is enabled.
+    if ((GetFillSizeFieldsFlags() == TRUE) &&
+        ((pIn->size != sizeof(ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT)) ||
+         (pOut->size != sizeof(ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT))))
+    {
+        return ADDR_PARAMSIZEMISMATCH;
+    }
+
+    // Scratch storage, only used when UseTileIndex() accepts the caller's tile index.
+    ADDR_TILEINFO                          scratchTileInfo;
+    ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT localIn;
+
+    if (UseTileIndex(pIn->tileIndex))
+    {
+        // Work on a private copy so that the caller's input structure stays untouched.
+        localIn           = *pIn;
+        localIn.pTileInfo = &scratchTileInfo;
+
+        const ADDR_E_RETURNCODE setupCode =
+            HwlSetupTileCfg(localIn.tileIndex, localIn.macroModeIndex, localIn.pTileInfo);
+
+        if (setupCode != ADDR_OK)
+        {
+            return setupCode;
+        }
+
+        // All remaining reads go through the patched copy.
+        pIn = &localIn;
+    }
+
+    // A factor of 1 selects the HTILE path of the shared xmask helper.
+    HwlComputeXmaskCoordFromAddr(pIn->addr,
+                                 pIn->bitPosition,
+                                 pIn->pitch,
+                                 pIn->height,
+                                 pIn->numSlices,
+                                 1,
+                                 pIn->isLinear,
+                                 blockWidthIs8,
+                                 blockHeightIs8,
+                                 pIn->pTileInfo,
+                                 &pOut->x,
+                                 &pOut->y,
+                                 &pOut->slice);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Interface function behind AddrComputeCmaskAddrFromCoord: cmask address of a coordinate
+*
+*   @return
+*       ADDR_E_RETURNCODE (ADDR_OK unless validation, tile setup or the Hwl hook fails)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskAddrFromCoord(
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    // Structure sizes are only verified when size checking is enabled.
+    if ((GetFillSizeFieldsFlags() == TRUE) &&
+        ((pIn->size != sizeof(ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT)) ||
+         (pOut->size != sizeof(ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT))))
+    {
+        return ADDR_PARAMSIZEMISMATCH;
+    }
+
+    // Scratch storage, only used when UseTileIndex() accepts the caller's tile index.
+    ADDR_TILEINFO                          scratchTileInfo;
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT localIn;
+
+    if (UseTileIndex(pIn->tileIndex))
+    {
+        // Work on a private copy so that the caller's input structure stays untouched.
+        localIn           = *pIn;
+        localIn.pTileInfo = &scratchTileInfo;
+
+        const ADDR_E_RETURNCODE setupCode =
+            HwlSetupTileCfg(localIn.tileIndex, localIn.macroModeIndex, localIn.pTileInfo);
+
+        if (setupCode != ADDR_OK)
+        {
+            return setupCode;
+        }
+
+        // All remaining reads go through the patched copy.
+        pIn = &localIn;
+    }
+
+    // The tcCompatible variant is delegated to the hardware layer, whose return code is
+    // passed through unchanged.
+    if (pIn->flags.tcCompatible == TRUE)
+    {
+        return HwlComputeCmaskAddrFromCoord(pIn, pOut);
+    }
+
+    // Everything else goes through the shared xmask helper. A factor of 2 selects its
+    // CMASK path, for which the htile block-size flags are not needed and are passed as
+    // FALSE.
+    pOut->addr = HwlComputeXmaskAddrFromCoord(pIn->pitch,
+                                              pIn->height,
+                                              pIn->x,
+                                              pIn->y,
+                                              pIn->slice,
+                                              pIn->numSlices,
+                                              2,
+                                              pIn->isLinear,
+                                              FALSE, // cmask: isWidth8 is not needed
+                                              FALSE, // cmask: isHeight8 is not needed
+                                              pIn->pTileInfo,
+                                              &pOut->bitPosition);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskCoordFromAddr
+*
+*   @brief
+*       Interface function behind AddrComputeCmaskCoordFromAddr: coordinate of a cmask address
+*
+*   @return
+*       ADDR_OK on success; ADDR_PARAMSIZEMISMATCH or a HwlSetupTileCfg failure code otherwise
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskCoordFromAddr(
+    const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    // Structure sizes are only verified when size checking is enabled.
+    if ((GetFillSizeFieldsFlags() == TRUE) &&
+        ((pIn->size != sizeof(ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT)) ||
+         (pOut->size != sizeof(ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT))))
+    {
+        return ADDR_PARAMSIZEMISMATCH;
+    }
+
+    // Scratch storage, only used when UseTileIndex() accepts the caller's tile index.
+    ADDR_TILEINFO                          scratchTileInfo;
+    ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT localIn;
+
+    if (UseTileIndex(pIn->tileIndex))
+    {
+        // Work on a private copy so that the caller's input structure stays untouched.
+        localIn           = *pIn;
+        localIn.pTileInfo = &scratchTileInfo;
+
+        const ADDR_E_RETURNCODE setupCode =
+            HwlSetupTileCfg(localIn.tileIndex, localIn.macroModeIndex, localIn.pTileInfo);
+
+        if (setupCode != ADDR_OK)
+        {
+            return setupCode;
+        }
+
+        // All remaining reads go through the patched copy.
+        pIn = &localIn;
+    }
+
+    // A factor of 2 selects the CMASK path of the shared xmask helper; the htile
+    // block-size flags are passed as FALSE.
+    HwlComputeXmaskCoordFromAddr(pIn->addr,
+                                 pIn->bitPosition,
+                                 pIn->pitch,
+                                 pIn->height,
+                                 pIn->numSlices,
+                                 2,
+                                 pIn->isLinear,
+                                 FALSE,
+                                 FALSE,
+                                 pIn->pTileInfo,
+                                 &pOut->x,
+                                 &pOut->y,
+                                 &pOut->slice);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeTileDataWidthAndHeight
+*
+*   @brief
+*       Compute a close-to-square macro tile shape for per-tile data (CMASK and HTILE)
+*
+*   @return
+*       N/A (results are written to pMacroWidth and pMacroHeight)
+*
+*   @note
+*       MacroWidth and macroHeight are measured in pixels
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeTileDataWidthAndHeight(
+    UINT_32         bpp,             ///< [in] bits per pixel
+    UINT_32         cacheBits,       ///< [in] bits of cache
+    ADDR_TILEINFO*  pTileInfo,       ///< [in] Tile info
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight     ///< [out] macro tile height
+    ) const
+{
+    // Start from a single row that is cacheBits / bpp elements wide.
+    UINT_32       tilesHigh = 1;
+    UINT_32       tilesWide = cacheBits / bpp;
+    const UINT_32 numPipes  = HwlGetPipes(pTileInfo);
+
+    // Fold the row in half (half the width, twice the height) for as long as the width
+    // is even and still exceeds twice the pipe-scaled height.
+    while (((tilesWide & 1) == 0) && (tilesWide > tilesHigh * 2 * numPipes))
+    {
+        tilesWide >>= 1;
+        tilesHigh <<= 1;
+    }
+
+    // Scale by 8 (and the height by the pipe count) to obtain the reported dimensions.
+    *pMacroWidth  = tilesWide * 8;
+    *pMacroHeight = tilesHigh * 8 * numPipes;
+
+    // Note: The above iterative computation is equivalent to the following
+    //   log2_height = (log2(cacheBits) - log2(bpp) - log2(pipes)) / 2
+    //   macroHeight = pow2(3 + log2(pipes) + log2_height)
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlComputeTileDataWidthAndHeightLinear
+*
+*   @brief
+*       Compute the macro tile dimensions of per-tile data (CMASK and HTILE) in linear layout
+*
+*   @return
+*       N/A (results are written to pMacroWidth and pMacroHeight)
+*
+*   @note
+*       Both outputs are measured in pixels; pTileInfo is not read by this implementation
+***************************************************************************************************
+*/
+VOID AddrLib::HwlComputeTileDataWidthAndHeightLinear(
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight,    ///< [out] macro tile height
+    UINT_32         bpp,             ///< [in] bits per pixel
+    ADDR_TILEINFO*  pTileInfo        ///< [in] tile info (not read here)
+    ) const
+{
+    ADDR_ASSERT(bpp != 4);              // Cmask does not support linear layout prior to SI
+    *pMacroWidth  = (8 * 512) / bpp;    // Width is aligned to 512-bit memory accesses
+    *pMacroHeight = m_pipes * 8;        // Height is aligned to the number of pipes
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeHtileInfo
+*
+*   @brief
+*       Compute htile pitch, height, size in bytes, macro-tile dimensions and alignment
+*
+*   @return
+*       Htile bpp as reported by HwlComputeHtileBpp.
+*       Everything else is returned through the output parameters; the outputs from
+*       pMacroWidth onwards are optional and are written with SafeAssign.
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeHtileInfo(
+    ADDR_HTILE_FLAGS flags,             ///< [in] htile flags
+    UINT_32          pitchIn,           ///< [in] pitch input
+    UINT_32          heightIn,          ///< [in] height input
+    UINT_32          numSlices,         ///< [in] number of slices
+    BOOL_32          isLinear,          ///< [in] if it is linear mode
+    BOOL_32          isWidth8,          ///< [in] if htile block width is 8
+    BOOL_32          isHeight8,         ///< [in] if htile block height is 8
+    ADDR_TILEINFO*   pTileInfo,         ///< [in] Tile info
+    UINT_32*         pPitchOut,         ///< [out] pitch output
+    UINT_32*         pHeightOut,        ///< [out] height output
+    UINT_64*         pHtileBytes,       ///< [out] htile size in bytes
+    UINT_32*         pMacroWidth,       ///< [out] macro-tile width in pixels
+    UINT_32*         pMacroHeight,      ///< [out] macro-tile height in pixels
+    UINT_64*         pSliceSize,        ///< [out] slice size in bytes
+    UINT_32*         pBaseAlign         ///< [out] base alignment
+    ) const
+{
+    // A slice count of 0 is treated as 1.
+    numSlices = Max(1u, numSlices);
+
+    const UINT_32 htileBpp       = HwlComputeHtileBpp(isWidth8, isHeight8);
+    const UINT_32 htileCacheBits = HtileCacheBits;
+
+    // Macro-tile footprint: linear layouts use the dedicated helper, tiled layouts derive
+    // it from the htile cache size.
+    UINT_32 macroTileW;
+    UINT_32 macroTileH;
+
+    if (!isLinear)
+    {
+        ComputeTileDataWidthAndHeight(htileBpp,
+                                      htileCacheBits,
+                                      pTileInfo,
+                                      &macroTileW,
+                                      &macroTileH);
+    }
+    else
+    {
+        HwlComputeTileDataWidthAndHeightLinear(&macroTileW,
+                                               &macroTileH,
+                                               htileBpp,
+                                               pTileInfo);
+    }
+
+    // Pad the requested dimensions up to the macro-tile dimensions.
+    *pPitchOut  = PowTwoAlign(pitchIn, macroTileW);
+    *pHeightOut = PowTwoAlign(heightIn, macroTileH);
+
+    UINT_32 alignment =
+        HwlComputeHtileBaseAlign(flags.tcCompatible, isLinear, pTileInfo);
+
+    // The return value becomes *pHtileBytes; the slice size is received through
+    // perSlice.
+    UINT_64 perSlice;
+
+    *pHtileBytes = HwlComputeHtileBytes(*pPitchOut,
+                                        *pHeightOut,
+                                        htileBpp,
+                                        isLinear,
+                                        numSlices,
+                                        &perSlice,
+                                        alignment);
+
+    //
+    // The remaining outputs are optional, hence SafeAssign.
+    //
+    SafeAssign(pMacroWidth, macroTileW);
+    SafeAssign(pMacroHeight, macroTileH);
+    SafeAssign(pSliceSize, perSlice);
+    SafeAssign(pBaseAlign, alignment);
+
+    return htileBpp;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskBaseAlign
+*
+*   @brief
+*       Compute the base alignment of a cmask surface
+*
+*   @return
+*       Pipe interleave bytes times pipes, additionally times banks for tcCompatible cmask
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeCmaskBaseAlign(
+    ADDR_CMASK_FLAGS flags,           ///< [in] Cmask flags
+    ADDR_TILEINFO*   pTileInfo        ///< [in] Tile info
+    ) const
+{
+    // Default alignment: one pipe interleave per pipe.
+    const UINT_32 pipeAlign = m_pipeInterleaveBytes * HwlGetPipes(pTileInfo);
+
+    if (!flags.tcCompatible)
+    {
+        return pipeAlign;
+    }
+
+    // The tcCompatible variant additionally scales by the bank count when tile info exists.
+    ADDR_ASSERT(pTileInfo != NULL);
+
+    return (pTileInfo != NULL) ? (pipeAlign * pTileInfo->banks) : pipeAlign;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskBytes
+*
+*   @brief
+*       Compute the cmask size in bytes for the given dimensions (computed in 64 bits)
+*
+*   @return
+*       Cmask size in bytes
+***************************************************************************************************
+*/
+UINT_64 AddrLib::ComputeCmaskBytes(
+    UINT_32 pitch,        ///< [in] pitch
+    UINT_32 height,       ///< [in] height
+    UINT_32 numSlices     ///< [in] number of slices
+    ) const
+{
+    const UINT_64 scaledBits = static_cast<UINT_64>(pitch) * height * numSlices * CmaskElemBits;
+    return BITS_TO_BYTES(scaledBits) / MicroTilePixels;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeCmaskInfo
+*
+*   @brief
+*       Compute cmask pitch, height, size in bytes, macro-tile dimensions and block max
+*
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS when block max exceeded HwlGetMaxCmaskBlockMax() and
+*       was clamped. All computed values are returned through the output parameters.
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputeCmaskInfo(
+    ADDR_CMASK_FLAGS flags,            ///< [in] cmask flags
+    UINT_32          pitchIn,           ///< [in] pitch input
+    UINT_32          heightIn,          ///< [in] height input
+    UINT_32          numSlices,         ///< [in] number of slices
+    BOOL_32          isLinear,          ///< [in] is linear mode
+    ADDR_TILEINFO*   pTileInfo,         ///< [in] Tile info
+    UINT_32*         pPitchOut,         ///< [out] pitch output
+    UINT_32*         pHeightOut,        ///< [out] height output
+    UINT_64*         pCmaskBytes,       ///< [out] total bytes of all slices
+    UINT_32*         pMacroWidth,       ///< [out] macro-tile width in pixels
+    UINT_32*         pMacroHeight,      ///< [out] macro-tile height in pixels
+    UINT_64*         pSliceSize,        ///< [out] slice size in bytes
+    UINT_32*         pBaseAlign,        ///< [out] base alignment
+    UINT_32*         pBlockMax          ///< [out] block max == slice / 128 / 128 - 1
+    ) const
+{
+    UINT_32 macroWidth;
+    UINT_32 macroHeight;
+    UINT_32 baseAlign;
+    UINT_64 surfBytes;
+    UINT_64 sliceBytes;
+
+    numSlices = Max(1u, numSlices);     // a slice count of 0 is treated as 1
+
+    const UINT_32 bpp = CmaskElemBits;
+    const UINT_32 cacheBits = CmaskCacheBits;
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (isLinear)
+    {
+        HwlComputeTileDataWidthAndHeightLinear(&macroWidth,
+                                               &macroHeight,
+                                               bpp,
+                                               pTileInfo);
+    }
+    else
+    {
+        ComputeTileDataWidthAndHeight(bpp,
+                                      cacheBits,
+                                      pTileInfo,
+                                      &macroWidth,
+                                      &macroHeight);
+    }
+
+    // Round up to the macro-tile size; the mask form is exact for power-of-two sizes.
+    *pPitchOut = (pitchIn + macroWidth - 1) & ~(macroWidth - 1);
+    *pHeightOut = (heightIn + macroHeight - 1) & ~(macroHeight - 1);
+
+    // Size of a single slice at the aligned dimensions.
+    sliceBytes = ComputeCmaskBytes(*pPitchOut,
+                                   *pHeightOut,
+                                   1);
+
+    baseAlign = ComputeCmaskBaseAlign(flags, pTileInfo);
+
+    // Grow the height one macro tile at a time until a slice is a multiple of baseAlign.
+    while (sliceBytes % baseAlign)
+    {
+        *pHeightOut += macroHeight;
+
+        sliceBytes = ComputeCmaskBytes(*pPitchOut,
+                                       *pHeightOut,
+                                       1);
+    }
+
+    surfBytes = sliceBytes * numSlices;
+
+    *pCmaskBytes = surfBytes;
+
+    //
+    // Use SafeAssign since these outputs are optional
+    //
+    SafeAssign(pMacroWidth, macroWidth);
+
+    SafeAssign(pMacroHeight, macroHeight);
+
+    SafeAssign(pBaseAlign, baseAlign);
+
+    SafeAssign(pSliceSize, sliceBytes);
+
+    // Pixel area of one aligned slice; block max counts its 128x128 blocks minus one.
+    UINT_32 slice = (*pPitchOut) * (*pHeightOut);
+    UINT_32 blockMax = slice / 128 / 128 - 1;
+
+#if DEBUG
+    if (slice % (64*256) != 0)          // debug builds flag areas that are not 64*256 multiples
+    {
+        ADDR_ASSERT_ALWAYS();
+    }
+#endif //DEBUG
+
+    UINT_32 maxBlockMax = HwlGetMaxCmaskBlockMax();
+
+    if (blockMax > maxBlockMax)         // clamp and report the request as invalid
+    {
+        blockMax = maxBlockMax;
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    SafeAssign(pBlockMax, blockMax);
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeXmaskCoordYFromPipe
+*
+*   @brief
+*       Recover the Y coordinate bits of a cmask/htile element from its pipe number and x
+*
+*   @return
+*       Y coordinate (0 for pipe counts other than 2, 4 and 8)
+*
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputeXmaskCoordYFromPipe(
+    UINT_32         pipe,       ///< [in] pipe number
+    UINT_32         x           ///< [in] x coordinate
+    ) const
+{
+    //
+    // Convert pipe + x to y coordinate.
+    //
+    // Every pipe bit is the XOR of one x bit and one y bit, so a y bit is recovered by
+    // XOR-ing the matching pipe bit with that x bit.
+    //
+    const UINT_32 pipeCount = m_pipes; // SI has its implementation
+
+    // Bits of the inputs that take part in the equations below.
+    const UINT_32 pipeBit0 = pipe & 0x1;
+    const UINT_32 pipeBit1 = (pipe >> 1) & 0x1;
+    const UINT_32 xBit0    = x & 0x1;
+    const UINT_32 xBit1    = (x >> 1) & 0x1;
+
+    UINT_32 coordY = 0;
+
+    if (pipeCount == 1)
+    {
+        //
+        // 1 pipe
+        //
+        // p0 = 0
+        //
+        coordY = 0;
+    }
+    else if (pipeCount == 2)
+    {
+        //
+        // 2 pipes
+        //
+        // p0 = x0 ^ y0
+        //
+        // y0 = p0 ^ x0
+        //
+        coordY = pipeBit0 ^ xBit0;
+    }
+    else if (pipeCount == 4)
+    {
+        //
+        // 4 pipes
+        //
+        // p0 = x1 ^ y0
+        // p1 = x0 ^ y1
+        //
+        // y0 = p0 ^ x1
+        // y1 = p1 ^ x0
+        //
+        // Example: pipe = 2 (p1 = 1, p0 = 0) and x = 1 (x1 = 0, x0 = 1) give
+        // y0 = 0 ^ 0 = 0 and y1 = 1 ^ 1 = 0, i.e. y = 0.
+        //
+        const UINT_32 yBit0 = pipeBit0 ^ xBit1;
+        const UINT_32 yBit1 = pipeBit1 ^ xBit0;
+
+        coordY = yBit0 | (yBit1 << 1);
+    }
+    else if (pipeCount == 8)
+    {
+        //
+        // 8 pipes
+        //
+        // r600 and r800 use different methods, hence the hardware-layer hook
+        //
+        coordY = HwlComputeXmaskCoordYFrom8Pipe(pipe, x);
+    }
+    else
+    {
+        //
+        // Any other pipe count is not handled here and yields y = 0.
+        //
+        coordY = 0;
+    }
+
+    return coordY;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlComputeXmaskCoordFromAddr
+*
+*   @brief
+*       Compute the x/y/slice coordinate that corresponds to an address of a cmask/htile
+*
+*   @return
+*       N/A (results are written to pX, pY and pSlice)
+*
+*   @note
+*       This method is shared by cmask and htile, hence the name Xmask
+***************************************************************************************************
+*/
+VOID AddrLib::HwlComputeXmaskCoordFromAddr(
+    UINT_64         addr,           ///< [in] address
+    UINT_32         bitPosition,    ///< [in] bitPosition in a byte
+    UINT_32         pitch,          ///< [in] pitch
+    UINT_32         height,         ///< [in] height
+    UINT_32         numSlices,      ///< [in] number of slices
+    UINT_32         factor,         ///< [in] 2 selects cmask, any other value is handled as htile
+    BOOL_32         isLinear,       ///< [in] linear or tiled HTILE layout
+    BOOL_32         isWidth8,       ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    BOOL_32         isHeight8,      ///< [in] TRUE if height is 8, FALSE means 4. It's register value
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] Tile info
+    UINT_32*        pX,             ///< [out] x coord
+    UINT_32*        pY,             ///< [out] y coord
+    UINT_32*        pSlice          ///< [out] slice index
+    ) const
+{
+    UINT_32 pipe;
+    UINT_32 numPipes;
+    UINT_32 numPipeBits;
+    UINT_32 macroTilePitch;
+    UINT_32 macroTileHeight;
+
+    UINT_64 bitAddr;
+
+    UINT_32 microTileCoordY;
+
+    UINT_32 elemBits;
+
+    UINT_32 pitchAligned = pitch;
+    UINT_32 heightAligned = height;
+    UINT_64 totalBytes;
+
+    UINT_64 elemOffset;
+
+    UINT_64 macroIndex;
+    UINT_32 microIndex;
+
+    UINT_64 macroNumber;
+    UINT_32 microNumber;
+
+    UINT_32 macroX;
+    UINT_32 macroY;
+    UINT_32 macroZ;
+
+    UINT_32 microX;
+    UINT_32 microY;
+
+    UINT_32 tilesPerMacro;
+    UINT_32 macrosPerPitch;
+    UINT_32 macrosPerSlice;
+
+    //
+    // Extract pipe.
+    //
+    numPipes = HwlGetPipes(pTileInfo);
+    pipe = ComputePipeFromAddr(addr, numPipes);
+
+    //
+    // Compute the number of group and pipe bits.
+    //
+    numPipeBits  = Log2(numPipes);
+
+    UINT_32 groupBits = 8 * m_pipeInterleaveBytes;  // pipe interleave size expressed in bits
+    UINT_32 pipes = numPipes;
+
+
+    //
+    // Compute the micro tile size, in bits. And macro tile pitch and height.
+    //
+    if (factor == 2) //CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        elemBits = CmaskElemBits;
+
+        // NOTE(review): the trailing outputs are omitted here; presumably they are
+        // defaulted in the class declaration - confirm against the header.
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &pitchAligned,
+                         &heightAligned,
+                         &totalBytes,
+                         &macroTilePitch,
+                         &macroTileHeight);
+    }
+    else  //HTILE
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        if (factor != 1)    // every factor other than 2 is normalized to the htile factor 1
+        {
+            factor = 1;
+        }
+
+        elemBits = HwlComputeHtileBpp(isWidth8, isHeight8);
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         isWidth8,
+                         isHeight8,
+                         pTileInfo,
+                         &pitchAligned,
+                         &heightAligned,
+                         &totalBytes,
+                         &macroTilePitch,
+                         &macroTileHeight);
+    }
+
+    // Should use aligned dims
+    //
+    pitch = pitchAligned;
+    height = heightAligned;
+
+
+    //
+    // Convert byte address to bit address.
+    //
+    bitAddr = BYTES_TO_BITS(addr) + bitPosition;
+
+
+    //
+    // Remove pipe bits from address.
+    //
+
+    bitAddr = (bitAddr % groupBits) + ((bitAddr/groupBits/pipes)*groupBits);
+
+    // Offset of the element, in units of elemBits, within the address without pipe bits.
+    elemOffset = bitAddr / elemBits;
+
+    // '>>' has the lowest precedence here: the whole product/quotient is shifted.
+    tilesPerMacro = (macroTilePitch/factor) * macroTileHeight / MicroTilePixels >> numPipeBits;
+
+    macrosPerPitch = pitch / (macroTilePitch/factor);
+    macrosPerSlice = macrosPerPitch * height / macroTileHeight;
+
+    macroIndex = elemOffset / factor / tilesPerMacro;
+    microIndex = static_cast<UINT_32>(elemOffset % (tilesPerMacro * factor));
+
+    macroNumber = macroIndex * factor + microIndex % factor;
+    microNumber = microIndex / factor;
+
+    macroX = static_cast<UINT_32>((macroNumber % macrosPerPitch));
+    macroY = static_cast<UINT_32>((macroNumber % macrosPerSlice) / macrosPerPitch);
+    macroZ = static_cast<UINT_32>((macroNumber / macrosPerSlice));
+
+
+    microX = microNumber % (macroTilePitch / factor / MicroTileWidth);
+    microY = (microNumber / (macroTilePitch / factor / MicroTileHeight));
+
+    // Coordinates before the pipe contribution; '<<' applies to microY * MicroTileHeight.
+    *pX = macroX * (macroTilePitch/factor) + microX * MicroTileWidth;
+    *pY = macroY * macroTileHeight + (microY * MicroTileHeight << numPipeBits);
+    *pSlice = macroZ;
+
+    microTileCoordY = ComputeXmaskCoordYFromPipe(pipe,
+                                                 *pX/MicroTileWidth);
+
+
+    //
+    // Assemble final coordinates.
+    //
+    *pY += microTileCoordY * MicroTileHeight;
+
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlComputeXmaskAddrFromCoord
+*
+*   @brief
+*       Compute the address from the coordinates of a cmask/htile element (prior to si)
+*
+*   @return
+*       Address in bytes
+*
+***************************************************************************************************
+*/
+UINT_64 AddrLib::HwlComputeXmaskAddrFromCoord(
+    UINT_32        pitch,          ///< [in] pitch
+    UINT_32        height,         ///< [in] height
+    UINT_32        x,              ///< [in] x coord
+    UINT_32        y,              ///< [in] y coord
+    UINT_32        slice,          ///< [in] slice/depth index
+    UINT_32        numSlices,      ///< [in] number of slices
+    UINT_32        factor,         ///< [in] 2 selects cmask; any other value is handled as htile(1)
+    BOOL_32        isLinear,       ///< [in] linear or tiled HTILE layout (forced to FALSE for cmask)
+    BOOL_32        isWidth8,       ///< [in] TRUE if width is 8, FALSE means 4. It's register value
+    BOOL_32        isHeight8,      ///< [in] TRUE if height is 8, FALSE means 4. It's register value
+    ADDR_TILEINFO* pTileInfo,      ///< [in] Tile info
+    UINT_32*       pBitPosition    ///< [out] bit position inside a byte (0 or 4)
+    ) const
+{
+    UINT_64 addr;
+    UINT_32 numGroupBits;
+    UINT_32 numPipeBits;
+    UINT_32 newPitch = 0;
+    UINT_32 newHeight = 0;
+    UINT_64 sliceBytes = 0;
+    UINT_64 totalBytes = 0;
+    UINT_64 sliceOffset;
+    UINT_32 pipe;
+    UINT_32 macroTileWidth;
+    UINT_32 macroTileHeight;
+    UINT_32 macroTilesPerRow;
+    UINT_32 macroTileBytes;
+    UINT_32 macroTileIndexX;
+    UINT_32 macroTileIndexY;
+    UINT_64 macroTileOffset;
+    UINT_32 pixelBytesPerRow;
+    UINT_32 pixelOffsetX;
+    UINT_32 pixelOffsetY;
+    UINT_32 pixelOffset;
+    UINT_64 totalOffset;
+    UINT_64 offsetLo;
+    UINT_64 offsetHi;
+    UINT_64 groupMask;
+
+
+    UINT_32 elemBits = 0;
+
+    UINT_32 numPipes = m_pipes; // This function is only used by ASICs prior to SI
+
+    if (factor == 2) //CMASK
+    {
+        elemBits = CmaskElemBits;
+
+        // For asics before SI, cmask is always tiled
+        isLinear = FALSE;
+    }
+    else //HTILE
+    {
+        if (factor != 1) // Normalize any other value to 1 (also fixes a compile warning)
+        {
+            factor = 1;
+        }
+
+        elemBits = HwlComputeHtileBpp(isWidth8, isHeight8);
+    }
+
+    //
+    // Compute the number of group bits and pipe bits.
+    //
+    numGroupBits = Log2(m_pipeInterleaveBytes);
+    numPipeBits  = Log2(numPipes);
+
+    //
+    // Compute macro tile dimensions (and the padded pitch/height and slice size).
+    //
+    if (factor == 2) // CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroTileWidth,
+                         &macroTileHeight);
+
+        sliceBytes = totalBytes / numSlices;
+    }
+    else // HTILE
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         isWidth8,
+                         isHeight8,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroTileWidth,
+                         &macroTileHeight,
+                         &sliceBytes);
+    }
+
+    sliceOffset = slice * sliceBytes;
+
+    //
+    // Get the pipe.  Note that neither slice rotation nor pipe swizzling apply for CMASK.
+    //
+    pipe = ComputePipeFromCoord(x,
+                                y,
+                                0,
+                                ADDR_TM_2D_TILED_THIN1,
+                                0,
+                                FALSE,
+                                pTileInfo);
+
+    //
+    // Compute the number of macro tiles per row.
+    //
+    macroTilesPerRow = newPitch / macroTileWidth;
+
+    //
+    // Compute the number of bytes per macro tile.
+    //
+    macroTileBytes = BITS_TO_BYTES((macroTileWidth * macroTileHeight * elemBits) / MicroTilePixels);
+
+    //
+    // Compute the offset to the macro tile containing the specified coordinate.
+    //
+    macroTileIndexX = x / macroTileWidth;
+    macroTileIndexY = y / macroTileHeight;
+    macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
+
+    //
+    // Compute the number of bytes in one row of micro tiles within the macro tile.
+    //
+    pixelBytesPerRow = BITS_TO_BYTES(macroTileWidth * elemBits) / MicroTileWidth;
+
+    //
+    // The nibbles are interleaved (see below), so the part of the offset relative to the x
+    // coordinate repeats halfway across the row. (Not for HTILE)
+    //
+    if (factor == 2)
+    {
+        pixelOffsetX = (x % (macroTileWidth / 2)) / MicroTileWidth;
+    }
+    else
+    {
+        pixelOffsetX = (x % (macroTileWidth)) / MicroTileWidth * BITS_TO_BYTES(elemBits);
+    }
+
+    //
+    // Compute the y offset within the macro tile.
+    //
+    pixelOffsetY = (((y % macroTileHeight) / MicroTileHeight) / numPipes) * pixelBytesPerRow;
+
+    pixelOffset = pixelOffsetX + pixelOffsetY;
+
+    //
+    // Combine the slice offset and macro tile offset with the pixel offset, accounting for the
+    // pipe bits in the middle of the address.
+    //
+    totalOffset = ((sliceOffset + macroTileOffset) >> numPipeBits) + pixelOffset;
+
+    //
+    // Split the offset to put some bits below the pipe bits and some above.
+    //
+    groupMask = (1 << numGroupBits) - 1;
+    offsetLo  = totalOffset &  groupMask;
+    offsetHi  = (totalOffset & ~groupMask) << numPipeBits;
+
+    //
+    // Assemble the address from its components: offsetHi | pipe | offsetLo.
+    //
+    addr  = offsetLo;
+    addr |= offsetHi;
+    // This is to remove warning with /analyze option
+    UINT_32 pipeBits = pipe << numGroupBits;
+    addr |= pipeBits;
+
+    //
+    // Compute the bit position.  For CMASK (factor 2) the lower nibble is used when the x
+    // coordinate within the macro tile is less than half of the macro tile width, the upper nibble
+    // otherwise.  For HTILE (factor 1) the test always passes, so the bit position is always 0.
+    //
+    *pBitPosition = ((x % macroTileWidth) < (macroTileWidth / factor)) ? 0 : 4;
+
+    return addr;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Surface Addressing Shared
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceAddrFromCoordLinear
+*
+*   @brief
+*       Compute the byte address and bit position of an element in a linear surface
+*
+*   @return
+*       Address in bytes; the remaining bit offset is written to *pBitPosition
+*
+***************************************************************************************************
+*/
+UINT_64 AddrLib::ComputeSurfaceAddrFromCoordLinear(
+    UINT_32  x,              ///< [in] x coord
+    UINT_32  y,              ///< [in] y coord
+    UINT_32  slice,          ///< [in] slice/depth index
+    UINT_32  sample,         ///< [in] sample index
+    UINT_32  bpp,            ///< [in] bits per pixel
+    UINT_32  pitch,          ///< [in] pitch
+    UINT_32  height,         ///< [in] height
+    UINT_32  numSlices,      ///< [in] number of slices
+    UINT_32* pBitPosition    ///< [out] bit position inside a byte (address in bits modulo 8)
+    ) const
+{
+    // Elements per slice, widened to 64 bits before the multiplication.
+    const UINT_64 elemsPerSlice = static_cast<UINT_64>(pitch) * height;
+
+    // Linear element index: whole slices first (each sample index advances by numSlices
+    // slices), then whole rows, then the column.
+    const UINT_32 sliceIndex = slice + sample * numSlices;
+    UINT_64 elemIndex = sliceIndex * elemsPerSlice;
+    elemIndex += static_cast<UINT_64>(y) * pitch;
+    elemIndex += x;
+    const UINT_64 bitAddr = elemIndex * bpp;
+    *pBitPosition = static_cast<UINT_32>(bitAddr % 8);
+    return bitAddr / 8;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceCoordFromAddrLinear
+*
+*   @brief
+*       Recover x/y/slice/sample from a byte address plus bit position in a linear surface
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeSurfaceCoordFromAddrLinear(
+    UINT_64  addr,           ///< [in] address
+    UINT_32  bitPosition,    ///< [in] bitPosition in a byte
+    UINT_32  bpp,            ///< [in] bits per pixel
+    UINT_32  pitch,          ///< [in] pitch
+    UINT_32  height,         ///< [in] height
+    UINT_32  numSlices,      ///< [in] number of slices
+    UINT_32* pX,             ///< [out] x coord
+    UINT_32* pY,             ///< [out] y coord
+    UINT_32* pSlice,         ///< [out] slice/depth index
+    UINT_32* pSample         ///< [out] sample index
+    ) const
+{
+    const UINT_64 elemsPerSlice = static_cast<UINT_64>(pitch) * height;
+    const UINT_64 elemIndex     = (BYTES_TO_BITS(addr) + bitPosition) / bpp;
+    const UINT_64 inSlice       = elemIndex % elemsPerSlice; // element offset within its slice
+    *pX      = static_cast<UINT_32>(inSlice % pitch);
+    *pY      = static_cast<UINT_32>((inSlice / pitch) % height);
+    *pSlice  = static_cast<UINT_32>((elemIndex / elemsPerSlice) % numSlices);
+    *pSample = static_cast<UINT_32>((elemIndex / elemsPerSlice) / numSlices);
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeSurfaceCoordFromAddrMicroTiled
+*
+*   @brief
+*       Compute the coord from an address (byte address + bit position) of a micro tiled surface
+*
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeSurfaceCoordFromAddrMicroTiled(
+    UINT_64         addr,               ///< [in] address
+    UINT_32         bitPosition,        ///< [in] bitPosition in a byte
+    UINT_32         bpp,                ///< [in] bits per pixel
+    UINT_32         pitch,              ///< [in] pitch
+    UINT_32         height,             ///< [in] height
+    UINT_32         numSamples,         ///< [in] number of samples
+    AddrTileMode    tileMode,           ///< [in] tile mode
+    UINT_32         tileBase,           ///< [in] base offset within a tile
+    UINT_32         compBits,           ///< [in] component bits actually needed(for planar surface)
+    UINT_32*        pX,                 ///< [out] x coord
+    UINT_32*        pY,                 ///< [out] y coord
+    UINT_32*        pSlice,             ///< [out] slice/depth index
+    UINT_32*        pSample,            ///< [out] sample index (always 0 for thick tile modes)
+    AddrTileType    microTileType,      ///< [in] micro tiling order
+    BOOL_32         isDepthSampleOrder  ///< [in] TRUE if in depth sample order
+    ) const
+{
+    UINT_64 bitAddr;
+    UINT_32 microTileThickness;
+    UINT_32 microTileBits;
+    UINT_64 sliceBits;
+    UINT_64 rowBits;
+    UINT_32 sliceIndex;
+    UINT_32 microTileCoordX;
+    UINT_32 microTileCoordY;
+    UINT_32 pixelOffset;
+    UINT_32 pixelCoordX = 0;
+    UINT_32 pixelCoordY = 0;
+    UINT_32 pixelCoordZ = 0;
+    UINT_32 pixelCoordS = 0;
+
+    //
+    // Convert byte address to bit address.
+    //
+    bitAddr = BYTES_TO_BITS(addr) + bitPosition;
+
+    //
+    // Pick the micro tile thickness: only 1D thick tiles are thicker than one slice here.
+    //
+    switch (tileMode)
+    {
+        case ADDR_TM_1D_TILED_THICK:
+            microTileThickness = ThickTileThickness;
+            break;
+        default:
+            microTileThickness = 1;
+            break;
+    }
+
+    microTileBits = MicroTilePixels * microTileThickness * bpp * numSamples;
+
+    //
+    // Compute number of bits per slice and per row of micro tiles.  Both are widened to 64 bits
+    // before multiplying so that large pitches cannot wrap the 32-bit intermediate product.
+    sliceBits = static_cast<UINT_64>(pitch) * height * microTileThickness * bpp * numSamples;
+
+    rowBits   = static_cast<UINT_64>(pitch / MicroTileWidth) * microTileBits;
+
+    //
+    // Extract the slice index.
+    //
+    sliceIndex = static_cast<UINT_32>(bitAddr / sliceBits);
+    bitAddr -= sliceIndex * sliceBits;
+
+    //
+    // Extract the y coordinate of the micro tile.
+    //
+    microTileCoordY = static_cast<UINT_32>(bitAddr / rowBits) * MicroTileHeight;
+    bitAddr -= (microTileCoordY / MicroTileHeight) * rowBits;
+
+    //
+    // Extract the x coordinate of the micro tile.
+    //
+    microTileCoordX = static_cast<UINT_32>(bitAddr / microTileBits) * MicroTileWidth;
+
+    //
+    // Compute the pixel offset (in bits) within the micro tile.
+    //
+    pixelOffset = static_cast<UINT_32>(bitAddr % microTileBits);
+
+    //
+    // Extract pixel coordinates from the offset.
+    //
+    HwlComputePixelCoordFromOffset(pixelOffset,
+                                   bpp,
+                                   numSamples,
+                                   tileMode,
+                                   tileBase,
+                                   compBits,
+                                   &pixelCoordX,
+                                   &pixelCoordY,
+                                   &pixelCoordZ,
+                                   &pixelCoordS,
+                                   microTileType,
+                                   isDepthSampleOrder);
+
+    //
+    // Assemble final coordinates.
+    //
+    *pX     = microTileCoordX + pixelCoordX;
+    *pY     = microTileCoordY + pixelCoordY;
+    *pSlice = (sliceIndex * microTileThickness) + pixelCoordZ;
+    *pSample = pixelCoordS;
+
+    if (microTileThickness > 1) // thick tiles report sample 0 regardless of pixelCoordS
+    {
+        *pSample = 0;
+    }
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputePipeFromAddr
+*
+*   @brief
+*       Extract the pipe number from the bits of an address just above the pipe interleave
+*
+*   @return
+*       Pipe number
+*
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputePipeFromAddr(
+    UINT_64 addr,        ///< [in] address
+    UINT_32 numPipes     ///< [in] number of pipes
+    ) const
+{
+    // R600
+    // The LSBs of the address are arranged as follows:
+    //   bank | pipe | group
+    //
+    // R800
+    // The LSBs of the address are arranged as follows:
+    //   bank | bankInterleave | pipe | pipeInterleave
+    //
+    // "group" and "pipeInterleave" are just different terms for the same field, so in both
+    // layouts the pipe bits sit directly above the m_pipeInterleaveBytes low-order bits.
+    //
+    const UINT_32 interleaveBits = Log2(m_pipeInterleaveBytes);
+
+    //
+    // Shift off the interleave bits and mask the pipe bits.  The mask (numPipes - 1) selects
+    // whole pipe bits only if numPipes is a power of two -- presumably always the case.
+    //
+    const UINT_32 addrAbovePipeInterleave = static_cast<UINT_32>(addr >> interleaveBits);
+    const UINT_32 pipeMask                = numPipes - 1;
+
+    return addrAbovePipeInterleave & pipeMask;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputePixelIndexWithinMicroTile
+*
+*   @brief
+*       Compute the pixel index inside a micro tile by interleaving low bits of x, y and z
+*
+*   @return
+*       Pixel index (up to 9 bits, assembled from pixelBit0..pixelBit8)
+*
+***************************************************************************************************
+*/
+UINT_32 AddrLib::ComputePixelIndexWithinMicroTile(
+    UINT_32         x,              ///< [in] x coord
+    UINT_32         y,              ///< [in] y coord
+    UINT_32         z,              ///< [in] slice/depth index
+    UINT_32         bpp,            ///< [in] bits per pixel
+    AddrTileMode    tileMode,       ///< [in] tile mode
+    AddrTileType    microTileType   ///< [in] pixel order in display/non-display mode
+    ) const
+{
+    UINT_32 pixelBit0 = 0;
+    UINT_32 pixelBit1 = 0;
+    UINT_32 pixelBit2 = 0;
+    UINT_32 pixelBit3 = 0;
+    UINT_32 pixelBit4 = 0;
+    UINT_32 pixelBit5 = 0;
+    UINT_32 pixelBit6 = 0;
+    UINT_32 pixelBit7 = 0;
+    UINT_32 pixelBit8 = 0;
+    UINT_32 pixelNumber;
+
+    UINT_32 x0 = _BIT(x, 0);
+    UINT_32 x1 = _BIT(x, 1);
+    UINT_32 x2 = _BIT(x, 2);
+    UINT_32 y0 = _BIT(y, 0);
+    UINT_32 y1 = _BIT(y, 1);
+    UINT_32 y2 = _BIT(y, 2);
+    UINT_32 z0 = _BIT(z, 0);
+    UINT_32 z1 = _BIT(z, 1);
+    UINT_32 z2 = _BIT(z, 2);
+
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    // Choose which coordinate bit feeds each bit of the pixel number within the micro tile.
+
+    if (microTileType != ADDR_THICK)
+    {
+        if (microTileType == ADDR_DISPLAYABLE)
+        {
+            switch (bpp)
+            {
+                case 8:
+                    pixelBit0 = x0;
+                    pixelBit1 = x1;
+                    pixelBit2 = x2;
+                    pixelBit3 = y1;
+                    pixelBit4 = y0;
+                    pixelBit5 = y2;
+                    break;
+                case 16:
+                    pixelBit0 = x0;
+                    pixelBit1 = x1;
+                    pixelBit2 = x2;
+                    pixelBit3 = y0;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                case 32:
+                    pixelBit0 = x0;
+                    pixelBit1 = x1;
+                    pixelBit2 = y0;
+                    pixelBit3 = x2;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                case 64:
+                    pixelBit0 = x0;
+                    pixelBit1 = y0;
+                    pixelBit2 = x1;
+                    pixelBit3 = x2;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                case 128:
+                    pixelBit0 = y0;
+                    pixelBit1 = x0;
+                    pixelBit2 = x1;
+                    pixelBit3 = x2;
+                    pixelBit4 = y1;
+                    pixelBit5 = y2;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    break;
+            }
+        }
+        else if (microTileType == ADDR_NON_DISPLAYABLE || microTileType == ADDR_DEPTH_SAMPLE_ORDER)
+        {
+            pixelBit0 = x0; // x and y bits strictly alternate, independent of bpp
+            pixelBit1 = y0;
+            pixelBit2 = x1;
+            pixelBit3 = y1;
+            pixelBit4 = x2;
+            pixelBit5 = y2;
+        }
+        else if (microTileType == ADDR_ROTATED)
+        {
+            ADDR_ASSERT(thickness == 1);
+
+            switch (bpp) // note: no 128 bpp case here, it falls through to the assert
+            {
+                case 8:
+                    pixelBit0 = y0;
+                    pixelBit1 = y1;
+                    pixelBit2 = y2;
+                    pixelBit3 = x1;
+                    pixelBit4 = x0;
+                    pixelBit5 = x2;
+                    break;
+                case 16:
+                    pixelBit0 = y0;
+                    pixelBit1 = y1;
+                    pixelBit2 = y2;
+                    pixelBit3 = x0;
+                    pixelBit4 = x1;
+                    pixelBit5 = x2;
+                    break;
+                case 32:
+                    pixelBit0 = y0;
+                    pixelBit1 = y1;
+                    pixelBit2 = x0;
+                    pixelBit3 = y2;
+                    pixelBit4 = x1;
+                    pixelBit5 = x2;
+                    break;
+                case 64:
+                    pixelBit0 = y0;
+                    pixelBit1 = x0;
+                    pixelBit2 = y1;
+                    pixelBit3 = x1;
+                    pixelBit4 = x2;
+                    pixelBit5 = y2;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    break;
+            }
+        }
+
+        if (thickness > 1) // non-ADDR_THICK order on a thick tile mode: z goes above x/y
+        {
+            pixelBit6 = z0;
+            pixelBit7 = z1;
+        }
+    }
+    else // ADDR_THICK: z0/z1 are interleaved into the low six bits, x2/y2 go on top
+    {
+        ADDR_ASSERT(thickness > 1);
+
+        switch (bpp)
+        {
+            case 8:
+            case 16:
+                pixelBit0 = x0;
+                pixelBit1 = y0;
+                pixelBit2 = x1;
+                pixelBit3 = y1;
+                pixelBit4 = z0;
+                pixelBit5 = z1;
+                break;
+            case 32:
+                pixelBit0 = x0;
+                pixelBit1 = y0;
+                pixelBit2 = x1;
+                pixelBit3 = z0;
+                pixelBit4 = y1;
+                pixelBit5 = z1;
+                break;
+            case 64:
+            case 128:
+                pixelBit0 = y0;
+                pixelBit1 = x0;
+                pixelBit2 = z0;
+                pixelBit3 = x1;
+                pixelBit4 = y1;
+                pixelBit5 = z1;
+                break;
+            default:
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+
+        pixelBit6 = x2;
+        pixelBit7 = y2;
+    }
+
+    if (thickness == 8) // only thickness 8 needs a third z bit, for any micro tile type
+    {
+        pixelBit8 = z2;
+    }
+
+    pixelNumber = ((pixelBit0     ) |
+                   (pixelBit1 << 1) |
+                   (pixelBit2 << 2) |
+                   (pixelBit3 << 3) |
+                   (pixelBit4 << 4) |
+                   (pixelBit5 << 5) |
+                   (pixelBit6 << 6) |
+                   (pixelBit7 << 7) |
+                   (pixelBit8 << 8));
+
+    return pixelNumber;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::AdjustPitchAlignment
+*
+*   @brief
+*       Raises the pitch alignment of display and overlay surfaces to what flipping requires
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrLib::AdjustPitchAlignment(
+    ADDR_SURFACE_FLAGS  flags,      ///< [in] Surface flags
+    UINT_32*            pPitchAlign ///< [in/out] Pointer to pitch alignment
+    ) const
+{
+    // Nothing to adjust unless the surface is a display or overlay surface.
+    if (!flags.display && !flags.overlay)
+    {
+        return;
+    }
+
+    // Display engine hardwires lower 5 bit of GRPH_PITCH to ZERO which means 32 pixel alignment.
+    // Display surfaces additionally never go below m_minPitchAlignPixels.
+    const UINT_32 pitchAlign32 = PowTwoAlign(*pPitchAlign, 32);
+
+    *pPitchAlign = flags.display ? Max(m_minPitchAlignPixels, pitchAlign32) : pitchAlign32;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::PadDimensions
+*
+*   @brief
+*       Pads pitch, height and slice count to their alignments, then calls HwlPadDimensions
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID AddrLib::PadDimensions(
+    AddrTileMode        tileMode,    ///< [in] tile mode
+    UINT_32             bpp,         ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,       ///< [in] surface flags
+    UINT_32             numSamples,  ///< [in] number of samples
+    ADDR_TILEINFO*      pTileInfo,   ///< [in/out] bank structure.
+    UINT_32             padDims,     ///< [in] Dimensions to pad, valid values 1,2,3 (0 acts as 3)
+    UINT_32             mipLevel,    ///< [in] MipLevel
+    UINT_32*            pPitch,      ///< [in/out] pitch in pixels
+    UINT_32             pitchAlign,  ///< [in] pitch alignment
+    UINT_32*            pHeight,     ///< [in/out] height in pixels
+    UINT_32             heightAlign, ///< [in] height alignment
+    UINT_32*            pSlices,     ///< [in/out] number of slices
+    UINT_32             sliceAlign   ///< [in] number of slice alignment
+    ) const
+{
+    const UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    ADDR_ASSERT(padDims <= 3);
+
+    //
+    // Cubemap sub levels override the requested padding: pad all three dimensions when the
+    // client passes more than one slice (the cubemap is then treated as a 3d texture),
+    // otherwise pad only two dimensions.
+    //
+    if ((mipLevel > 0) && flags.cube)
+    {
+        padDims = (*pSlices > 1) ? 3 : 2;
+    }
+
+    //
+    // A request of 0 dimensions is handled as a request to pad all three.
+    //
+    if (padDims == 0)
+    {
+        padDims = 3;
+    }
+
+    //
+    // Pitch is always padded.  PowTwoAlign handles power-of-two alignments; any other
+    // alignment (r600 linear mode does not align bpp to pow2) is rounded up to the next
+    // multiple arithmetically.
+    //
+    UINT_32 paddedPitch = *pPitch;
+
+    if (IsPow2(pitchAlign))
+    {
+        paddedPitch = PowTwoAlign(paddedPitch, pitchAlign);
+    }
+    else
+    {
+        paddedPitch = ((paddedPitch + pitchAlign - 1) / pitchAlign) * pitchAlign;
+    }
+
+    *pPitch = paddedPitch;
+
+    //
+    // Height is padded when at least two dimensions are requested.
+    //
+    if (padDims > 1)
+    {
+        *pHeight = PowTwoAlign(*pHeight, heightAlign);
+    }
+
+    //
+    // Slices are handled when all three dimensions are requested or the tile mode is thick.
+    //
+    if ((padDims > 2) || (thickness > 1))
+    {
+        // Cubemaps round the slice count up to a power of two, unless noCubeMipSlicesPad is
+        // set in the config flags and the surface is not a cube-as-array.
+        if (flags.cube && (flags.cubeAsArray || !m_configFlags.noCubeMipSlicesPad))
+        {
+            *pSlices = NextPow2(*pSlices);
+        }
+
+        // Thick tile modes additionally align the slice count to sliceAlign.
+        if (thickness > 1)
+        {
+            *pSlices = PowTwoAlign(*pSlices, sliceAlign);
+        }
+    }
+
+    //
+    // Hand the padded values to the Hwl hook for any further padding.
+    //
+    HwlPadDimensions(tileMode, bpp, flags, numSamples, pTileInfo,
+                     padDims, mipLevel,
+                     pPitch, pitchAlign,
+                     pHeight, heightAlign,
+                     pSlices, sliceAlign);
+}
+
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlPreHandleBaseLvl3xPitch
+*
+*   @brief
+*       Pre-handler of 3x pitch (96 bit) adjustment: undoes the 3x pre-multiplication
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 AddrLib::HwlPreHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    ADDR_ASSERT(pIn->width == expPitch);
+    const BOOL_32 isExpandedBaseLevel = AddrElemLib::IsExpand3x(pIn->format) &&
+                                        (pIn->mipLevel == 0) &&
+                                        (pIn->tileMode == ADDR_TM_LINEAR_ALIGNED);
+
+    if (!isExpandedBaseLevel)
+    {
+        return expPitch;
+    }
+
+    //
+    // The pitch is pre-multiplied by 3; retrieve the original one to get correct miplevel size
+    //
+    return NextPow2(expPitch / 3);
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlPostHandleBaseLvl3xPitch
+*
+*   @brief
+*       Post-handler of 3x pitch adjustment: multiplies the base level pitch by 3
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 AddrLib::HwlPostHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    const BOOL_32 isExpandedBaseLevel = AddrElemLib::IsExpand3x(pIn->format) &&
+                                        (pIn->mipLevel == 0) &&
+                                        (pIn->tileMode == ADDR_TM_LINEAR_ALIGNED);
+
+    //
+    // Only the base level of a 3x-expanded, linear-aligned surface gets its pitch multiplied
+    // by 3.  Sub levels of 96 bits surfaces require an element pitch of 32 bits instead, so
+    // their pitch is returned in 32 bit pixels without multiplying by 3.
+    //
+    const UINT_32 pitchFactor = isExpandedBaseLevel ? 3 : 1;
+
+    return expPitch * pitchFactor;
+}
+
+
+/**
+***************************************************************************************************
+*   AddrLib::IsMacroTiled
+*
+*   @brief
+*       Check if the tile mode is macro tiled, by looking up its isMacro flag in m_modeFlags
+*
+*   @return
+*       TRUE if it is macro tiled (2D/2B/3D/3B)
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsMacroTiled(
+    AddrTileMode tileMode)  ///< [in] tile mode, used as index into m_modeFlags (not range checked)
+{
+   return m_modeFlags[tileMode].isMacro;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsMacro3dTiled
+*
+*   @brief
+*       Check if the tile mode is 3D macro tiled, by looking up its isMacro3d flag in m_modeFlags
+*
+*   @return
+*       TRUE if it is 3D macro tiled
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsMacro3dTiled(
+    AddrTileMode tileMode)  ///< [in] tile mode, used as index into m_modeFlags (not range checked)
+{
+    return m_modeFlags[tileMode].isMacro3d;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsMicroTiled
+*
+*   @brief
+*       Check if the tile mode is micro tiled, by looking up its isMicro flag in m_modeFlags
+*
+*   @return
+*       TRUE if micro tiled
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsMicroTiled(
+    AddrTileMode tileMode)  ///< [in] tile mode, used as index into m_modeFlags (not range checked)
+{
+    return m_modeFlags[tileMode].isMicro;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsLinear
+*
+*   @brief
+*       Check if the tile mode is linear, by looking up its isLinear flag in m_modeFlags
+*
+*   @return
+*       TRUE if linear
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsLinear(
+    AddrTileMode tileMode)  ///< [in] tile mode, used as index into m_modeFlags (not range checked)
+{
+    return m_modeFlags[tileMode].isLinear;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsPrtNoRotationTileMode
+*
+*   @brief
+*       Return TRUE if it is a prt tile mode without rotation (isPrtNoRotation in m_modeFlags)
+*   @note
+*       This function is only used by CI
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsPrtNoRotationTileMode(
+    AddrTileMode tileMode)  ///< [in] tile mode, used as index into m_modeFlags (not range checked)
+{
+    return m_modeFlags[tileMode].isPrtNoRotation;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::IsPrtTileMode
+*
+*   @brief
+*       Return TRUE if it is a prt tile mode (isPrt flag in m_modeFlags)
+*   @note
+*       This function is only used by CI
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::IsPrtTileMode(
+    AddrTileMode tileMode)  ///< [in] tile mode, used as index into m_modeFlags (not range checked)
+{
+    return m_modeFlags[tileMode].isPrt;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::Bits2Number
+*
+*   @brief
+*       Concatenate a list of bit values, given MSB first, into a number
+*
+*   @return
+*       The number combined from the bits; the last argument ends up in bit 0
+***************************************************************************************************
+*/
+UINT_32 AddrLib::Bits2Number(
+    UINT_32 bitNum,     ///< [in] how many bits follow (up to 32 fit in the result)
+    ...)                ///< [in] variable bits value starting from MSB (OR-ed in unmasked)
+{
+    UINT_32 number = 0;
+    UINT_32 i;
+    va_list bits_ptr;
+
+    va_start(bits_ptr, bitNum);
+
+    for(i = 0; i < bitNum; i++)
+    {
+        // Make room first, then insert the next bit, so no bit is ever shifted past bit 31.
+        number = (number << 1) | va_arg(bits_ptr, UINT_32);
+    }
+
+    // No trailing shift is needed; with bitNum == 32 the first argument stays in bit 31.
+
+    va_end(bits_ptr);
+
+    return number;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeMipLevel
+*
+*   @brief
+*       Aligns block-compressed base levels to 4, then calls HwlComputeMipLevel
+*   @return
+*      N/A
+***************************************************************************************************
+*/
+VOID AddrLib::ComputeMipLevel(
+    ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn ///< [in/out] Input structure
+    ) const
+{
+    // DXTn's level 0 must be multiple of 4 in width and height, but callers do not always
+    // guarantee that:
+    // 1. Internal surface creation in hostblt/vsblt/etc...
+    // 2. Runtime doesn't reject ATI1/ATI2 whose width/height are not multiple of 4
+    // so the base level of block compressed formats is aligned up here.
+    const BOOL_32 isCompressedBaseLevel = AddrElemLib::IsBlockCompressed(pIn->format) &&
+                                          (pIn->mipLevel == 0);
+    if (isCompressedBaseLevel)
+    {
+        pIn->width  = PowTwoAlign(pIn->width, 4);
+        pIn->height = PowTwoAlign(pIn->height, 4);
+    }
+
+    HwlComputeMipLevel(pIn);
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::DegradeBaseLevel
+*
+*   @brief
+*       Check if base level's tile mode can be degraded to a 1D tiled mode
+*   @return
+*       TRUE if degraded, also returns degraded tile mode (unchanged if not degraded)
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::DegradeBaseLevel(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] Input structure for surface info
+    AddrTileMode*                           pTileMode   ///< [out] Degraded tile mode
+    ) const
+{
+    const AddrTileMode requestedMode = pIn->tileMode;
+    const UINT_32      thickness     = ComputeSurfaceThickness(requestedMode);
+
+    // Degradation must be enabled globally (config flag) and per surface, and only applies
+    // to the base level of a single-sample macro tiled surface.
+    const BOOL_32 eligible = m_configFlags.degradeBaseLevel &&
+                             pIn->flags.degrade4Space       &&
+                             (pIn->mipLevel == 0)           &&
+                             (pIn->numSamples == 1)         &&
+                             IsMacroTiled(requestedMode);
+
+    if (!eligible)
+    {
+        return FALSE;
+    }
+
+    if (HwlDegradeBaseLevel(pIn))
+    {
+        *pTileMode = (thickness == 1) ? ADDR_TM_1D_TILED_THIN1 : ADDR_TM_1D_TILED_THICK;
+        return TRUE;
+    }
+
+    // As in the following HwlComputeSurfaceInfo, thick modes may be degraded to thinner
+    // modes, so re-evaluate whether the corresponding thinner mode needs to be degraded.
+    // If so, 1D thick mode is chosen instead.
+    if (thickness > 1)
+    {
+        ADDR_COMPUTE_SURFACE_INFO_INPUT thinnerInput = *pIn;
+        thinnerInput.tileMode = DegradeLargeThickTile(requestedMode, pIn->bpp);
+
+        if ((thinnerInput.tileMode != requestedMode) && HwlDegradeBaseLevel(&thinnerInput))
+        {
+            *pTileMode = ADDR_TM_1D_TILED_THICK;
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::DegradeLargeThickTile
+*
+*   @brief
+*       Reduce thickness if a thick tile exceeds m_rowSize (skipped if allowLargeThickTile is set)
+*   @return
+*       The degraded tile mode (unchanged if not degraded)
+***************************************************************************************************
+*/
+AddrTileMode AddrLib::DegradeLargeThickTile(
+    AddrTileMode tileMode,  ///< [in] Tile mode to check
+    UINT_32 bpp) const      ///< [in] Bits per pixel (converted to bytes below with >> 3)
+{
+    // Override tilemode
+    // When tile_width (8) * tile_height (8) * thickness * element_bytes is > row_size,
+    // it is better to just use THIN mode in this case
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    if (thickness > 1 && m_configFlags.allowLargeThickTile == 0)
+    {
+        UINT_32 tileSize = MicroTilePixels * thickness * (bpp >> 3); // bytes in one thick tile
+
+        if (tileSize > m_rowSize)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_2D_TILED_XTHICK:
+                    if ((tileSize >> 1) <= m_rowSize) // half the size fits: stop at THICK
+                    {
+                        tileMode = ADDR_TM_2D_TILED_THICK;
+                        break;
+                    }
+                    // else fall through
+                case ADDR_TM_2D_TILED_THICK:
+                    tileMode    = ADDR_TM_2D_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_3D_TILED_XTHICK:
+                    if ((tileSize >> 1) <= m_rowSize) // half the size fits: stop at THICK
+                    {
+                        tileMode = ADDR_TM_3D_TILED_THICK;
+                        break;
+                    }
+                    // else fall through
+                case ADDR_TM_3D_TILED_THICK:
+                    tileMode    = ADDR_TM_3D_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_PRT_TILED_THICK:
+                    tileMode    = ADDR_TM_PRT_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_PRT_2D_TILED_THICK:
+                    tileMode    = ADDR_TM_PRT_2D_TILED_THIN1;
+                    break;
+
+                case ADDR_TM_PRT_3D_TILED_THICK:
+                    tileMode    = ADDR_TM_PRT_3D_TILED_THIN1;
+                    break;
+
+                default: // any other tile mode is returned unchanged
+                    break;
+            }
+        }
+    }
+
+    return tileMode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::PostComputeMipLevel
+*   @brief
+*       Pad pIn width/height/numSlices to the next power of two after surface adjustment
+*   @return
+*       ADDR_OK (this implementation has no failure path)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::PostComputeMipLevel(
+    ADDR_COMPUTE_SURFACE_INFO_INPUT*    pIn,   ///< [in/out] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*   pOut   ///< [out] Output structure (not accessed here)
+    ) const
+{
+    // Mipmap including level 0 must be pow2 padded since either SI hw expects so or it is
+    // required by CFX for Hw Compatibility between NI and SI. Otherwise it is only needed for
+    // mipLevel > 0. Any h/w with a different requirement should implement its own virtual function
+
+    if (pIn->flags.pow2Pad)
+    {
+        pIn->width      = NextPow2(pIn->width);
+        pIn->height     = NextPow2(pIn->height);
+        pIn->numSlices  = NextPow2(pIn->numSlices);
+    }
+    else if (pIn->mipLevel > 0)
+    {
+        pIn->width      = NextPow2(pIn->width);
+        pIn->height     = NextPow2(pIn->height);
+
+        if (!pIn->flags.cube)
+        {
+            pIn->numSlices = NextPow2(pIn->numSlices);
+        }
+
+        // for cubemap, numSlices is left unchanged here
+    }
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlSetupTileCfg
+*
+*   @brief
+*       Map tile index to tile setting. This base implementation ignores all of its parameters.
+*   @return
+*       ADDR_NOTSUPPORTED, unconditionally
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::HwlSetupTileCfg(
+    INT_32          index,            ///< [in] Tile index
+    INT_32          macroModeIndex,   ///< [in] Index in macro tile mode table(CI)
+    ADDR_TILEINFO*  pInfo,            ///< [out] Tile Info
+    AddrTileMode*   pMode,            ///< [out] Tile mode
+    AddrTileType*   pType             ///< [out] Tile type
+    ) const
+{
+    return ADDR_NOTSUPPORTED;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::HwlGetPipes
+*
+*   @brief
+*       Get number of pipes
+*   @return
+*       m_pipes (pTileInfo is not read by this implementation)
+***************************************************************************************************
+*/
+UINT_32 AddrLib::HwlGetPipes(
+    const ADDR_TILEINFO* pTileInfo    ///< [in] Tile info
+    ) const
+{
+    //pTileInfo can be NULL when asic is 6xx and 8xx.
+    return m_pipes;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputeQbStereoInfo
+*
+*   @brief
+*       Fill pOut->pStereoInfo and double pOut height/pixelHeight/surfSize for quad buffer stereo
+*   @return
+*       TRUE if pOut->pStereoInfo is non-NULL (pOut updated); FALSE otherwise (pOut untouched)
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::ComputeQbStereoInfo(
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [in/out] updated pOut+pStereoInfo
+    ) const
+{
+    BOOL_32 success = FALSE;
+
+    if (pOut->pStereoInfo)
+    {
+        ADDR_ASSERT(pOut->bpp >= 8);
+        ADDR_ASSERT((pOut->surfSize % pOut->baseAlign) == 0);
+
+        // Save original height
+        pOut->pStereoInfo->eyeHeight = pOut->height;
+
+        // Right offset: the surface size before it is doubled below, narrowed to UINT_32
+        pOut->pStereoInfo->rightOffset = static_cast<UINT_32>(pOut->surfSize);
+
+        pOut->pStereoInfo->rightSwizzle = HwlComputeQbStereoRightSwizzle(pOut);
+        // Double height
+        pOut->height <<= 1;
+        pOut->pixelHeight <<= 1;
+
+        // Double size
+        pOut->surfSize <<= 1;
+
+        // Right start address meets the base align since it is guaranteed by AddrLib
+
+        // 1D surface on SI may break this rule, but we can force it to meet by checking .qbStereo.
+        success = TRUE;
+    }
+
+    return success;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//                               Element lib
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/**
+***************************************************************************************************
+*   AddrLib::Flt32ToDepthPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a depth/stencil pixel value
+*   @return
+*       ADDR_OK, or ADDR_PARAMSIZEMISMATCH if size fields are checked and do not match
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::Flt32ToDepthPixel(
+    const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,
+    ELEM_FLT32TODEPTHPIXEL_OUTPUT* pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ELEM_FLT32TODEPTHPIXEL_INPUT)) ||
+            (pOut->size != sizeof(ELEM_FLT32TODEPTHPIXEL_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        GetElemLib()->Flt32ToDepthPixel(pIn->format,
+                                        pIn->comps,
+                                        pOut->pPixel);
+        UINT_32 depthBase = 0;
+        UINT_32 stencilBase = 0; // never set to a non-zero value below
+        UINT_32 depthBits = 0;
+        UINT_32 stencilBits = 0;
+
+        switch (pIn->format)
+        {
+            case ADDR_DEPTH_16:
+                depthBits = 16;
+                break;
+            case ADDR_DEPTH_X8_24:
+            case ADDR_DEPTH_8_24:
+            case ADDR_DEPTH_X8_24_FLOAT:
+            case ADDR_DEPTH_8_24_FLOAT:
+                depthBase = 8;
+                depthBits = 24;
+                stencilBits = 8;
+                break;
+            case ADDR_DEPTH_32_FLOAT:
+                depthBits = 32;
+                break;
+            case ADDR_DEPTH_X24_8_32_FLOAT:
+                depthBase = 8;
+                depthBits = 32;
+                stencilBits = 8;
+                break;
+            default: // other formats report all four outputs as 0
+                break;
+        }
+
+        // Overwrite base since R800 has no "tileBase"
+        if (GetElemLib()->IsDepthStencilTilePlanar() == FALSE)
+        {
+            depthBase = 0;
+            stencilBase = 0;
+        }
+
+        depthBase *= 64;
+        stencilBase *= 64;
+
+        pOut->stencilBase = stencilBase;
+        pOut->depthBase = depthBase;
+        pOut->depthBits = depthBits;
+        pOut->stencilBits = stencilBits;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::Flt32ToColorPixel
+*
+*   @brief
+*       Convert a FLT_32 value to a red/green/blue/alpha pixel value
+*   @return
+*       ADDR_OK, or ADDR_PARAMSIZEMISMATCH if size fields are checked and do not match
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::Flt32ToColorPixel(
+    const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,
+    ELEM_FLT32TOCOLORPIXEL_OUTPUT* pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if ((pIn->size != sizeof(ELEM_FLT32TOCOLORPIXEL_INPUT)) ||
+            (pOut->size != sizeof(ELEM_FLT32TOCOLORPIXEL_OUTPUT)))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK) // conversion itself is delegated to the element library
+    {
+        GetElemLib()->Flt32ToColorPixel(pIn->format,
+                                        pIn->surfNum,
+                                        pIn->surfSwap,
+                                        pIn->comps,
+                                        pOut->pPixel);
+    }
+
+    return returnCode;
+}
+
+
+/**
+***************************************************************************************************
+*   AddrLib::GetExportNorm
+*
+*   @brief
+*       Check whether one format can use EXPORT_NORM
+*   @return
+*       TRUE if EXPORT_NORM can be used; FALSE otherwise, including on input size mismatch
+***************************************************************************************************
+*/
+BOOL_32 AddrLib::GetExportNorm(
+    const ELEM_GETEXPORTNORM_INPUT* pIn) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK; // only gates the query below; never returned
+
+    BOOL_32 enabled = FALSE;
+
+    if (GetFillSizeFieldsFlags() == TRUE)
+    {
+        if (pIn->size != sizeof(ELEM_GETEXPORTNORM_INPUT))
+        {
+            returnCode = ADDR_PARAMSIZEMISMATCH;
+        }
+    }
+
+    if (returnCode == ADDR_OK)
+    {
+        enabled = GetElemLib()->PixGetExportNorm(pIn->format,
+                                                 pIn->num,
+                                                 pIn->swap);
+    }
+
+    return enabled;
+}
+
+/**
+***************************************************************************************************
+*   AddrLib::ComputePrtInfo
+*
+*   @brief
+*       Compute prt tile width/height from format bpp, base mip dimensions and fragment count
+*
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS for bpp < 8, 24, 48 or 96 (outputs are then written as 0)
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE AddrLib::ComputePrtInfo(
+    const ADDR_PRT_INFO_INPUT*  pIn,          ///< [in] format, baseMipDepth/Height, numFrags
+    ADDR_PRT_INFO_OUTPUT*       pOut) const   ///< [out] prtTileWidth and prtTileHeight
+{
+    ADDR_ASSERT(pOut != NULL);
+
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    UINT_32     expandX = 1;
+    UINT_32     expandY = 1;
+    AddrElemMode elemMode;
+
+    UINT_32     bpp = GetElemLib()->GetBitsPerPixel(pIn->format,
+                                                &elemMode,
+                                                &expandX,
+                                                &expandY);
+
+    if (bpp <8 || bpp == 24 || bpp == 48 || bpp == 96 )
+    {
+        returnCode = ADDR_INVALIDPARAMS;
+    }
+
+    UINT_32     numFrags = pIn->numFrags;
+    ADDR_ASSERT(numFrags <= 8);
+
+    UINT_32     tileWidth = 0;
+    UINT_32     tileHeight = 0;
+    if (returnCode == ADDR_OK)
+    {
+        // Not 1D: base mip has depth > 1 or height > 1 (3D or 2D texture)
+        if (pIn->baseMipDepth > 1 || pIn->baseMipHeight > 1)
+        {
+            if (bpp == 8)
+            {
+                tileWidth = 256;
+                tileHeight = 256;
+            }
+            else if (bpp == 16)
+            {
+                tileWidth = 256;
+                tileHeight = 128;
+            }
+            else if (bpp == 32)
+            {
+                tileWidth = 128;
+                tileHeight = 128;
+            }
+            else if (bpp == 64)
+            {
+                // assume it is BC1/4
+                tileWidth = 512;
+                tileHeight = 256;
+
+                if (elemMode == ADDR_UNCOMPRESSED)
+                {
+                    tileWidth = 128;
+                    tileHeight = 64;
+                }
+            }
+            else if (bpp == 128)
+            {
+                // assume it is BC2/3/5/6H/7
+                tileWidth = 256;
+                tileHeight = 256;
+
+                if (elemMode == ADDR_UNCOMPRESSED)
+                {
+                    tileWidth = 64;
+                    tileHeight = 64;
+                }
+            }
+
+            if (numFrags == 2) // shrink the tile by the fragment count (applied to non-1D only)
+            {
+                tileWidth = tileWidth / 2;
+            }
+            else if (numFrags == 4)
+            {
+                tileWidth = tileWidth / 2;
+                tileHeight = tileHeight / 2;
+            }
+            else if (numFrags == 8)
+            {
+                tileWidth = tileWidth / 4;
+                tileHeight = tileHeight / 2;
+            }
+        }
+        else    // 1d
+        {
+            tileHeight = 1;
+            if (bpp == 8)
+            {
+                tileWidth = 65536;
+            }
+            else if (bpp == 16)
+            {
+                tileWidth = 32768;
+            }
+            else if (bpp == 32)
+            {
+                tileWidth = 16384;
+            }
+            else if (bpp == 64)
+            {
+                tileWidth = 8192;
+            }
+            else if (bpp == 128)
+            {
+                tileWidth = 4096;
+            }
+        }
+    }
+
+    pOut->prtTileWidth = tileWidth;   // written even on failure (both stay 0 in that case)
+    pOut->prtTileHeight = tileHeight;
+
+    return returnCode;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.h
new file mode 100644
index 00000000000..43c55ff32ff
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrlib.h
@@ -0,0 +1,695 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrlib.h
+* @brief Contains the AddrLib base class definition.
+***************************************************************************************************
+*/
+
+#ifndef __ADDR_LIB_H__
+#define __ADDR_LIB_H__
+
+
+#include "addrinterface.h"
+#include "addrobject.h"
+#include "addrelemlib.h"
+
+#if BRAHMA_BUILD
+#include "amdgpu_id.h"
+#else
+#include "atiid.h"
+#endif
+
+#ifndef CIASICIDGFXENGINE_R600
+#define CIASICIDGFXENGINE_R600 0x00000006
+#endif
+
+#ifndef CIASICIDGFXENGINE_R800
+#define CIASICIDGFXENGINE_R800 0x00000008
+#endif
+
+#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
+#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
+#endif
+
+#ifndef CIASICIDGFXENGINE_SEAISLAND
+#define CIASICIDGFXENGINE_SEAISLAND 0x0000000B
+#endif
+/**
+***************************************************************************************************
+* @brief Neutral enums that define pipe interleave (each value is the number in its name)
+***************************************************************************************************
+*/
+enum AddrPipeInterleave
+{
+    ADDR_PIPEINTERLEAVE_256B = 256,
+    ADDR_PIPEINTERLEAVE_512B = 512,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define DRAM row size (values are byte counts, e.g. 1KB = 1024)
+***************************************************************************************************
+*/
+enum AddrRowSize
+{
+    ADDR_ROWSIZE_1KB = 1024,
+    ADDR_ROWSIZE_2KB = 2048,
+    ADDR_ROWSIZE_4KB = 4096,
+    ADDR_ROWSIZE_8KB = 8192,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define bank interleave (each value is the number in its name)
+***************************************************************************************************
+*/
+enum AddrBankInterleave
+{
+    ADDR_BANKINTERLEAVE_1 = 1,
+    ADDR_BANKINTERLEAVE_2 = 2,
+    ADDR_BANKINTERLEAVE_4 = 4,
+    ADDR_BANKINTERLEAVE_8 = 8,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define MGPU chip tile size (each value is the number in its name)
+***************************************************************************************************
+*/
+enum AddrChipTileSize
+{
+    ADDR_CHIPTILESIZE_16 = 16,
+    ADDR_CHIPTILESIZE_32 = 32,
+    ADDR_CHIPTILESIZE_64 = 64,
+    ADDR_CHIPTILESIZE_128 = 128,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define shader engine tile size (each value is the number in its name)
+***************************************************************************************************
+*/
+enum AddrEngTileSize
+{
+    ADDR_SE_TILESIZE_16 = 16,
+    ADDR_SE_TILESIZE_32 = 32,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define bank swap size (values are byte counts, e.g. 1KB = 1024)
+***************************************************************************************************
+*/
+enum AddrBankSwapSize
+{
+    ADDR_BANKSWAP_128B = 128,
+    ADDR_BANKSWAP_256B = 256,
+    ADDR_BANKSWAP_512B = 512,
+    ADDR_BANKSWAP_1KB = 1024,
+};
+
+/**
+***************************************************************************************************
+* @brief Neutral enums that define sample split size (values are byte counts, e.g. 1KB = 1024)
+***************************************************************************************************
+*/
+enum AddrSampleSplitSize
+{
+    ADDR_SAMPLESPLIT_1KB = 1024,
+    ADDR_SAMPLESPLIT_2KB = 2048,
+    ADDR_SAMPLESPLIT_4KB = 4096,
+    ADDR_SAMPLESPLIT_8KB = 8192,
+};
+
+/**
+***************************************************************************************************
+* @brief Flags for AddrTileMode: a 4-bit thickness field followed by seven 1-bit flags
+***************************************************************************************************
+*/
+struct AddrTileModeFlags
+{
+    UINT_32 thickness       : 4;
+    UINT_32 isLinear        : 1;
+    UINT_32 isMicro         : 1;
+    UINT_32 isMacro         : 1;
+    UINT_32 isMacro3d       : 1;
+    UINT_32 isPrt           : 1;
+    UINT_32 isPrtNoRotation : 1;
+    UINT_32 isBankSwapped   : 1;
+};
+
+/**
+***************************************************************************************************
+* @brief This class contains asic independent address lib functionalities
+***************************************************************************************************
+*/
+class AddrLib : public AddrObject
+{
+public:
+    virtual ~AddrLib();
+
+    static ADDR_E_RETURNCODE Create(
+        const ADDR_CREATE_INPUT* pCreateInfo, ADDR_CREATE_OUTPUT* pCreateOut);
+
+    /// Pair of Create
+    VOID Destroy()
+    {
+        delete this;
+    }
+
+    static AddrLib* GetAddrLib(
+        ADDR_HANDLE hLib);
+
+    /// Returns AddrLib version (from the compiled binary instead of the include file)
+    UINT_32 GetVersion()
+    {
+        return m_version;
+    }
+
+    /// Returns asic chip family name defined by AddrLib
+    AddrChipFamily GetAddrChipFamily()
+    {
+        return m_chipFamily;
+    }
+
+    /// Returns tileIndex support
+    BOOL_32 UseTileIndex(INT_32 index) const
+    {
+        return m_configFlags.useTileIndex && (index != TileIndexInvalid);
+    }
+
+    /// Returns combined swizzle support
+    BOOL_32 UseCombinedSwizzle() const
+    {
+        return m_configFlags.useCombinedSwizzle;
+    }
+
+    //
+    // Interface stubs
+    //
+    ADDR_E_RETURNCODE ComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeSliceTileSwizzle(
+        const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,
+        ADDR_COMPUTE_SLICESWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ExtractBankPipeSwizzle(
+        const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT* pIn,
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE CombineBankPipeSwizzle(
+        const ADDR_COMBINE_BANKPIPE_SWIZZLE_INPUT*  pIn,
+        ADDR_COMBINE_BANKPIPE_SWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeBaseSwizzle(
+        const ADDR_COMPUTE_BASE_SWIZZLE_INPUT*  pIn,
+        ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT*  pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+    ADDR_E_RETURNCODE ComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*  pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ConvertTileIndex(
+        const ADDR_CONVERT_TILEINDEX_INPUT* pIn,
+        ADDR_CONVERT_TILEINDEX_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ConvertTileIndex1(
+        const ADDR_CONVERT_TILEINDEX1_INPUT* pIn,
+        ADDR_CONVERT_TILEINDEX_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE GetTileIndex(
+        const ADDR_GET_TILEINDEX_INPUT* pIn,
+        ADDR_GET_TILEINDEX_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeHtileInfo(
+        const ADDR_COMPUTE_HTILE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_HTILE_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeCmaskInfo(
+        const ADDR_COMPUTE_CMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_CMASK_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeDccInfo(
+        const ADDR_COMPUTE_DCCINFO_INPUT* pIn,
+        ADDR_COMPUTE_DCCINFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeHtileAddrFromCoord(
+        const ADDR_COMPUTE_HTILE_ADDRFROMCOORD_INPUT*  pIn,
+        ADDR_COMPUTE_HTILE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeCmaskAddrFromCoord(
+        const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*  pIn,
+        ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeHtileCoordFromAddr(
+        const ADDR_COMPUTE_HTILE_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_HTILE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputeCmaskCoordFromAddr(
+        const ADDR_COMPUTE_CMASK_COORDFROMADDR_INPUT*  pIn,
+        ADDR_COMPUTE_CMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE ComputePrtInfo(
+        const ADDR_PRT_INFO_INPUT*  pIn,
+        ADDR_PRT_INFO_OUTPUT*       pOut) const;
+
+    ADDR_E_RETURNCODE Flt32ToDepthPixel(
+        const ELEM_FLT32TODEPTHPIXEL_INPUT* pIn,
+        ELEM_FLT32TODEPTHPIXEL_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE Flt32ToColorPixel(
+        const ELEM_FLT32TOCOLORPIXEL_INPUT* pIn,
+        ELEM_FLT32TOCOLORPIXEL_OUTPUT* pOut) const;
+
+    BOOL_32 GetExportNorm(
+        const ELEM_GETEXPORTNORM_INPUT* pIn) const;
+
+protected:
+    AddrLib();  // Constructor is protected
+    AddrLib(const AddrClient* pClient);
+
+    /// Pure Virtual function for Hwl computing surface info
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing surface address from coord
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing surface coord from address
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing surface tile swizzle
+    virtual ADDR_E_RETURNCODE HwlComputeSliceTileSwizzle(
+        const ADDR_COMPUTE_SLICESWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_SLICESWIZZLE_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl extracting bank/pipe swizzle from base256b
+    virtual ADDR_E_RETURNCODE HwlExtractBankPipeSwizzle(
+        const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT* pIn,
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl combining bank/pipe swizzle
+    virtual ADDR_E_RETURNCODE HwlCombineBankPipeSwizzle(
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, ADDR_TILEINFO*  pTileInfo,
+        UINT_64 baseAddr, UINT_32* pTileSwizzle) const = 0;
+
+    /// Pure Virtual function for Hwl computing base swizzle
+    virtual ADDR_E_RETURNCODE HwlComputeBaseSwizzle(
+        const ADDR_COMPUTE_BASE_SWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl computing HTILE base align
+    virtual UINT_32 HwlComputeHtileBaseAlign(
+        BOOL_32 isTcCompatible, BOOL_32 isLinear, ADDR_TILEINFO* pTileInfo) const = 0;
+
+    /// Pure Virtual function for Hwl computing HTILE bpp
+    virtual UINT_32 HwlComputeHtileBpp(
+        BOOL_32 isWidth8, BOOL_32 isHeight8) const = 0;
+
+    /// Pure Virtual function for Hwl computing HTILE bytes
+    virtual UINT_64 HwlComputeHtileBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 bpp,
+        BOOL_32 isLinear, UINT_32 numSlices, UINT_64* pSliceBytes, UINT_32 baseAlign) const = 0;
+
+    /// Pure Virtual function for Hwl computing FMASK info
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut) = 0;
+
+    /// Pure Virtual function for Hwl FMASK address from coord
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl FMASK coord from address
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl convert tile info from real value to HW value
+    virtual ADDR_E_RETURNCODE HwlConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const = 0;
+
+    /// Pure Virtual function for Hwl compute mipmap info
+    virtual BOOL_32 HwlComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const = 0;
+
+    /// Pure Virtual function for Hwl compute max cmask blockMax value
+    virtual BOOL_32 HwlGetMaxCmaskBlockMax() const = 0;
+
+    /// Pure Virtual function for Hwl compute fmask bits
+    virtual UINT_32 HwlComputeFmaskBits(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        UINT_32* pNumSamples) const = 0;
+
+    /// Virtual function to get index (not pure, so there is no need to implement it in all hwls)
+    virtual ADDR_E_RETURNCODE HwlGetTileIndex(
+        const ADDR_GET_TILEINDEX_INPUT* pIn,
+        ADDR_GET_TILEINDEX_OUTPUT*      pOut) const
+    {
+        return ADDR_NOTSUPPORTED;
+    }
+
+    /// Virtual function for Hwl to compute Dcc info
+    virtual ADDR_E_RETURNCODE HwlComputeDccInfo(
+        const ADDR_COMPUTE_DCCINFO_INPUT* pIn,
+        ADDR_COMPUTE_DCCINFO_OUTPUT* pOut) const
+    {
+        return ADDR_NOTSUPPORTED;
+    }
+
+    /// Virtual function to get cmask address for tc compatible cmask
+    virtual ADDR_E_RETURNCODE HwlComputeCmaskAddrFromCoord(
+        const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT* pOut) const
+    {
+        return ADDR_NOTSUPPORTED;
+    }
+    // Compute attributes
+
+    // HTILE
+    UINT_32    ComputeHtileInfo(
+        ADDR_HTILE_FLAGS flags,
+        UINT_32 pitchIn, UINT_32 heightIn, UINT_32 numSlices,
+        BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO*  pTileInfo,
+        UINT_32* pPitchOut, UINT_32* pHeightOut, UINT_64* pHtileBytes,
+        UINT_32* pMacroWidth = NULL, UINT_32* pMacroHeight = NULL,
+        UINT_64* pSliceSize = NULL, UINT_32* pBaseAlign = NULL) const;
+
+    // CMASK
+    ADDR_E_RETURNCODE ComputeCmaskInfo(
+        ADDR_CMASK_FLAGS flags,
+        UINT_32 pitchIn, UINT_32 heightIn, UINT_32 numSlices, BOOL_32 isLinear,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pPitchOut, UINT_32* pHeightOut, UINT_64* pCmaskBytes,
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight, UINT_64* pSliceSize = NULL,
+        UINT_32* pBaseAlign = NULL, UINT_32* pBlockMax = NULL) const;
+
+    virtual VOID HwlComputeTileDataWidthAndHeightLinear(
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight,
+        UINT_32 bpp, ADDR_TILEINFO* pTileInfo) const;
+
+    // CMASK & HTILE addressing (both virtual, so a derived layer may override them)
+    virtual UINT_64 HwlComputeXmaskAddrFromCoord(
+        UINT_32 pitch, UINT_32 height, UINT_32 x, UINT_32 y, UINT_32 slice,
+        UINT_32 numSlices, UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8,
+        BOOL_32 isHeight8, ADDR_TILEINFO* pTileInfo,
+        UINT_32* bitPosition) const;
+
+    virtual VOID HwlComputeXmaskCoordFromAddr(
+        UINT_64 addr, UINT_32 bitPosition, UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pX, UINT_32* pY, UINT_32* pSlice) const;
+
+    // Surface mipmap (takes a non-const pIn)
+    VOID    ComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const;
+
+    /// Pure virtual Hwl hook for checking degrade for base level
+    virtual BOOL_32 HwlDegradeBaseLevel(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const = 0;
+
+    virtual BOOL_32 HwlOverrideTileMode(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        AddrTileMode* pTileMode,
+        AddrTileType* pTileType) const
+    {
+        // Not supported in hwl layer by default; FALSE means nothing was overridden
+        return FALSE;
+    }
+
+    AddrTileMode DegradeLargeThickTile(AddrTileMode tileMode, UINT_32 bpp) const;
+
+    VOID PadDimensions(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 numSamples, ADDR_TILEINFO* pTileInfo, UINT_32 padDims, UINT_32 mipLevel,
+        UINT_32* pPitch, UINT_32 pitchAlign, UINT_32* pHeight, UINT_32 heightAlign,
+        UINT_32* pSlices, UINT_32 sliceAlign) const;
+
+    virtual VOID HwlPadDimensions(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 numSamples, ADDR_TILEINFO* pTileInfo, UINT_32 padDims, UINT_32 mipLevel,
+        UINT_32* pPitch, UINT_32 pitchAlign, UINT_32* pHeight, UINT_32 heightAlign,
+        UINT_32* pSlices, UINT_32 sliceAlign) const
+    {   // Base implementation is intentionally empty; same parameter list as PadDimensions
+    }
+
+    //
+    // Addressing helpers shared by linear/1D tiling
+    //
+    UINT_64 ComputeSurfaceAddrFromCoordLinear(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32* pBitPosition) const;
+
+    VOID    ComputeSurfaceCoordFromAddrLinear(
+        UINT_64 addr, UINT_32 bitPosition, UINT_32 bpp,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample) const;
+
+    VOID    ComputeSurfaceCoordFromAddrMicroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder) const;
+
+    UINT_32 ComputePixelIndexWithinMicroTile(
+        UINT_32 x, UINT_32 y, UINT_32 z,
+        UINT_32 bpp, AddrTileMode tileMode, AddrTileType microTileType) const;
+
+    /// Pure virtual Hwl hook for computing coord from an offset inside a micro tile
+    virtual VOID HwlComputePixelCoordFromOffset(
+        UINT_32 offset, UINT_32 bpp, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder) const = 0;
+
+    //
+    // Addressing shared by all
+    //
+    virtual UINT_32 HwlGetPipes(
+        const ADDR_TILEINFO* pTileInfo) const;
+
+    UINT_32 ComputePipeFromAddr(
+        UINT_64 addr, UINT_32 numPipes) const;
+
+    /// Pure virtual function for computing pipe from coord (note: no Hwl prefix in its name)
+    virtual UINT_32 ComputePipeFromCoord(
+        UINT_32 x, UINT_32 y, UINT_32 slice, AddrTileMode tileMode,
+        UINT_32 pipeSwizzle, BOOL_32 flags, ADDR_TILEINFO* pTileInfo) const = 0;
+
+    /// Pure virtual Hwl hook for computing coord Y for 8 pipe cmask/htile
+    virtual UINT_32 HwlComputeXmaskCoordYFrom8Pipe(
+        UINT_32 pipe, UINT_32 x) const = 0;
+
+    //
+    // Initialization
+    //
+    /// Pure virtual Hwl hook for computing internal global parameters from h/w registers
+    virtual BOOL_32 HwlInitGlobalParams(
+        const ADDR_CREATE_INPUT* pCreateIn) = 0;
+
+    /// Pure virtual Hwl hook for converting chip family (non-const, like HwlInitGlobalParams)
+    virtual AddrChipFamily HwlConvertChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision) = 0;
+
+    //
+    // Misc helpers (all static)
+    //
+    static const AddrTileModeFlags m_modeFlags[ADDR_TM_COUNT];
+
+    static UINT_32 ComputeSurfaceThickness(
+        AddrTileMode tileMode);
+
+    // Tile mode predicates
+    static BOOL_32 IsMacroTiled(AddrTileMode tileMode);
+    static BOOL_32 IsMacro3dTiled(AddrTileMode tileMode);
+    static BOOL_32 IsLinear(AddrTileMode tileMode);
+    static BOOL_32 IsMicroTiled(AddrTileMode tileMode);
+    static BOOL_32 IsPrtTileMode(AddrTileMode tileMode);
+    static BOOL_32 IsPrtNoRotationTileMode(AddrTileMode tileMode);
+
+    static UINT_32 Bits2Number(UINT_32 bitNum,...);
+
+    static UINT_32 GetNumFragments(UINT_32 numSamples, UINT_32 numFrags)
+    {   // An explicit fragment count wins; otherwise fall back to numSamples, but at least 1
+        return (numFrags == 0) ? Max(1u, numSamples) : numFrags;
+    }
+
+    /// Returns the AddrElemLib pointer stored in m_pElemLib
+    AddrElemLib* GetElemLib() const
+    {
+        return m_pElemLib;
+    }
+
+    /// Returns TRUE if tile info is needed, i.e. unless the ignoreTileInfo config flag is set
+    BOOL_32 UseTileInfo() const
+    {
+        return !m_configFlags.ignoreTileInfo;
+    }
+
+    /// Returns the fillSizeFields flag from m_configFlags
+    UINT_32 GetFillSizeFieldsFlags() const
+    {
+        return m_configFlags.fillSizeFields;
+    }
+
+    /// Adjusts pitch alignment for flipping surface (updates *pPitchAlign in place)
+    VOID    AdjustPitchAlignment(
+        ADDR_SURFACE_FLAGS flags, UINT_32* pPitchAlign) const;
+
+    /// Overwrite tile config according to tile index (mode and type default to NULL)
+    virtual ADDR_E_RETURNCODE HwlSetupTileCfg(
+        INT_32 index, INT_32 macroModeIndex,
+        ADDR_TILEINFO* pInfo, AddrTileMode* mode = NULL, AddrTileType* type = NULL) const;
+
+    /// Overwrite macro tile config according to tile index; base returns TileIndexNoMacroIndex
+    virtual INT_32 HwlComputeMacroModeIndex(
+        INT_32 index, ADDR_SURFACE_FLAGS flags, UINT_32 bpp, UINT_32 numSamples,
+        ADDR_TILEINFO* pTileInfo, AddrTileMode *pTileMode = NULL, AddrTileType *pTileType = NULL
+        ) const
+    {
+        return TileIndexNoMacroIndex;
+    }
+
+    /// Pre-handler of 3x pitch (96 bit) adjustment
+    virtual UINT_32 HwlPreHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+    /// Post-handler of 3x pitch (96 bit) adjustment
+    virtual UINT_32 HwlPostHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+    /// Check miplevel after surface adjustment
+    ADDR_E_RETURNCODE PostComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    /// Quad buffer stereo support, has its implementation in ind. layer
+    virtual BOOL_32 ComputeQbStereoInfo(
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    /// Pure virtual function to compute stereo bank swizzle for right eye
+    virtual UINT_32 HwlComputeQbStereoRightSwizzle(
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+private:
+    // Disallow the copy constructor (declared private)
+    AddrLib(const AddrLib& a);
+
+    // Disallow the assignment operator (declared private)
+    AddrLib& operator=(const AddrLib& a);
+
+    VOID SetAddrChipFamily(UINT_32 uChipFamily, UINT_32 uChipRevision);
+
+    UINT_32 ComputeCmaskBaseAlign(
+        ADDR_CMASK_FLAGS flags, ADDR_TILEINFO*  pTileInfo) const;
+
+    UINT_64 ComputeCmaskBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 numSlices) const;
+
+    //
+    // Methods shared by CMASK and HTILE
+    //
+    VOID    ComputeTileDataWidthAndHeight(
+        UINT_32 bpp, UINT_32 cacheBits, ADDR_TILEINFO* pTileInfo,
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight) const;
+
+    UINT_32 ComputeXmaskCoordYFromPipe(
+        UINT_32 pipe, UINT_32 x) const;
+
+    VOID SetMinPitchAlignPixels(UINT_32 minPitchAlignPixels);
+
+    BOOL_32 DegradeBaseLevel(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, AddrTileMode* pTileMode) const;
+
+protected:
+    AddrLibClass        m_class;        ///< Store class type (HWL type)
+
+    AddrChipFamily      m_chipFamily;   ///< Chip family translated from the one in atiid.h
+
+    UINT_32             m_chipRevision; ///< Revision id from xxx_id.h
+
+    UINT_32             m_version;      ///< Current version
+
+    //
+    // Global parameters
+    //
+    ADDR_CONFIG_FLAGS   m_configFlags;  ///< Global configuration flags. Note these are set up by
+                                        ///  AddrLib rather than the client, except forceLinearAligned
+
+    UINT_32             m_pipes;        ///< Number of pipes
+    UINT_32             m_banks;        ///< Number of banks
+                                        ///  For r800 this is MC_ARB_RAMCFG.NOOFBANK
+                                        ///  Keep it here to do default parameter calculation
+
+    UINT_32             m_pipeInterleaveBytes;
+                                        ///< Specifies the size of contiguous address space
+                                        ///  within each tiling pipe when making linear
+                                        ///  accesses. (Formerly Group Size)
+
+    UINT_32             m_rowSize;      ///< DRAM row size, in bytes
+
+    UINT_32             m_minPitchAlignPixels; ///< Minimum pitch alignment in pixels
+    UINT_32             m_maxSamples;   ///< Max numSamples
+private:
+    AddrElemLib*        m_pElemLib;     ///< Element Lib pointer (exposed through GetElemLib)
+};
+
+AddrLib* AddrSIHwlInit  (const AddrClient* pClient);
+AddrLib* AddrCIHwlInit  (const AddrClient* pClient);
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.cpp
new file mode 100644
index 00000000000..863a252fcf1
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrobject.cpp
+* @brief Contains the AddrObject base class implementation.
+***************************************************************************************************
+*/
+
+#include "addrinterface.h"
+#include "addrobject.h"
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrObject
+*
+*   @brief
+*       Default constructor; clears the client handle and the alloc/free/debugPrint callbacks.
+***************************************************************************************************
+*/
+AddrObject::AddrObject()
+{
+    m_client.callbacks.debugPrint = NULL;
+    m_client.callbacks.freeSysMem = NULL;
+    m_client.callbacks.allocSysMem = NULL;
+    m_client.handle = NULL;
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrObject
+*
+*   @brief
+*       Constructor that copies the supplied client; pClient is dereferenced unconditionally.
+***************************************************************************************************
+*/
+AddrObject::AddrObject(const AddrClient* pClient)
+    : m_client(*pClient)
+{
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::~AddrObject
+*
+*   @brief
+*       Destructor for the AddrObject class; the body is empty.
+***************************************************************************************************
+*/
+AddrObject::~AddrObject()
+{
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::ClientAlloc
+*
+*   @brief
+*       Allocates through the client's allocSysMem callback; NULL if unset or size too large
+***************************************************************************************************
+*/
+VOID* AddrObject::ClientAlloc(
+    size_t             objSize,    ///< [in] Size to allocate
+    const AddrClient*  pClient)    ///< [in] Client pointer
+{
+    VOID* pObjMem = NULL;
+    const UINT_32 sizeInBytes = static_cast<UINT_32>(objSize);
+
+    // Refuse sizes that do not survive the narrowing to UINT_32: a truncated request
+    // would hand back a buffer smaller than the caller asked for.
+    if ((pClient->callbacks.allocSysMem != NULL) && (sizeInBytes == objSize))
+    {
+        ADDR_ALLOCSYSMEM_INPUT allocInput = {0};
+        allocInput.size        = sizeof(ADDR_ALLOCSYSMEM_INPUT);
+        allocInput.flags.value = 0;
+        allocInput.sizeInBytes = sizeInBytes;
+        allocInput.hClient     = pClient->handle;
+        pObjMem = pClient->callbacks.allocSysMem(&allocInput);
+    }
+
+    return pObjMem;
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrMalloc
+*
+*   @brief
+*       A wrapper of ClientAlloc that uses this object's own client (m_client)
+***************************************************************************************************
+*/
+VOID* AddrObject::AddrMalloc(
+    size_t objSize) const   ///< [in] Size to allocate
+{
+    return ClientAlloc(objSize, &m_client);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::ClientFree
+*
+*   @brief
+*       Releases pObjMem through the client's freeSysMem callback, if both are non-NULL
+***************************************************************************************************
+*/
+VOID AddrObject::ClientFree(
+    VOID*              pObjMem,    ///< [in] User virtual address to free.
+    const AddrClient*  pClient)    ///< [in] Client pointer
+{
+    // Nothing to do without a free callback or for a NULL pointer
+    if ((pClient->callbacks.freeSysMem == NULL) || (pObjMem == NULL))
+    {
+        return;
+    }
+
+    ADDR_FREESYSMEM_INPUT request = {0};
+
+    request.size      = sizeof(ADDR_FREESYSMEM_INPUT);
+    request.hClient   = pClient->handle;
+    request.pVirtAddr = pObjMem;
+
+    pClient->callbacks.freeSysMem(&request);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::AddrFree
+*
+*   @brief
+*       A wrapper of ClientFree that uses this object's own client (m_client)
+***************************************************************************************************
+*/
+VOID AddrObject::AddrFree(
+    VOID* pObjMem) const                 ///< [in] User virtual address to free.
+{
+    ClientFree(pObjMem, &m_client);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::operator new
+*
+*   @brief
+*       Allocates memory needed for AddrObject object through the given client (ClientAlloc)
+*
+*   @return
+*       Returns the ClientAlloc result, which is NULL if unsuccessful.
+***************************************************************************************************
+*/
+VOID* AddrObject::operator new(
+    size_t             objSize,    ///< [in] Size to allocate
+    const AddrClient*  pClient)    ///< [in] Client pointer
+{
+    return ClientAlloc(objSize, pClient);
+}
+
+
+/**
+***************************************************************************************************
+*   AddrObject::operator delete
+*
+*   @brief
+*       Frees AddrObject object memory through an explicitly supplied client.
+***************************************************************************************************
+*/
+VOID AddrObject::operator delete(
+    VOID* pObjMem,              ///< [in] User virtual address to free.
+    const AddrClient* pClient)  ///< [in] Client pointer
+{
+    ClientFree(pObjMem, pClient);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::operator delete
+*
+*   @brief
+*       Frees AddrObject object memory using the client stored inside the object (m_client).
+***************************************************************************************************
+*/
+VOID AddrObject::operator delete(
+    VOID* pObjMem)                  ///< [in] User virtual address to free.
+{
+    AddrObject* pObj = static_cast<AddrObject*>(pObjMem); // m_client is read from this memory
+    ClientFree(pObjMem, &pObj->m_client);
+}
+
+/**
+***************************************************************************************************
+*   AddrObject::DebugPrint
+*
+*   @brief
+*       Forwards the debug string and its variable arguments to the client's debugPrint
+*       callback. Compiled to a no-op unless DEBUG is set; skipped when no callback is set.
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID AddrObject::DebugPrint(
+    const CHAR* pDebugString,     ///< [in] Debug string
+    ...) const
+{
+#if DEBUG
+    if (m_client.callbacks.debugPrint != NULL)
+    {
+        va_list ap;
+
+        va_start(ap, pDebugString);
+
+        ADDR_DEBUGPRINT_INPUT debugPrintInput = {0};
+
+        debugPrintInput.size         = sizeof(ADDR_DEBUGPRINT_INPUT);
+        debugPrintInput.pDebugString = const_cast<CHAR*>(pDebugString);
+        debugPrintInput.hClient      = m_client.handle;
+        va_copy(debugPrintInput.ap, ap);
+
+        m_client.callbacks.debugPrint(&debugPrintInput);
+        va_end(debugPrintInput.ap); // every va_copy needs its own matching va_end
+        va_end(ap);
+    }
+#endif
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.h b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.h
new file mode 100644
index 00000000000..35400885afe
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/core/addrobject.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  addrobject.h
+* @brief Contains the AddrObject base class definition.
+***************************************************************************************************
+*/
+
+#ifndef __ADDR_OBJECT_H__
+#define __ADDR_OBJECT_H__
+
+#include "addrtypes.h"
+#include "addrcommon.h"
+
+/**
+***************************************************************************************************
+* @brief This structure contains client specific data
+***************************************************************************************************
+*/
+struct AddrClient
+{
+    ADDR_CLIENT_HANDLE  handle;     ///< Client handle, forwarded as hClient in callback inputs
+    ADDR_CALLBACKS      callbacks;  ///< Client callbacks (allocSysMem, freeSysMem, debugPrint)
+};
+/**
+***************************************************************************************************
+* @brief This class is the base class for all ADDR class objects.
+***************************************************************************************************
+*/
+class AddrObject
+{
+public:
+    AddrObject();                                   ///< Clears the client handle and callbacks
+    AddrObject(const AddrClient* pClient);          ///< Copies *pClient into m_client
+    virtual ~AddrObject();
+
+    VOID* operator new(size_t size, const AddrClient* pClient);     ///< Allocates via pClient
+    VOID  operator delete(VOID* pObj, const AddrClient* pClient);   ///< Frees via pClient
+    VOID  operator delete(VOID* pObj);                              ///< Frees via pObj's m_client
+    VOID* AddrMalloc(size_t size) const;            ///< Allocates via m_client
+    VOID  AddrFree(VOID* pObj) const;               ///< Frees via m_client
+
+    VOID DebugPrint(
+        const CHAR* pDebugString,
+        ...) const;
+
+    const AddrClient* GetClient() const {return &m_client;}
+
+protected:
+    AddrClient m_client;    ///< Client handle and callbacks used for alloc/free/debug print
+
+private:
+    static VOID* ClientAlloc(size_t size, const AddrClient* pClient);
+    static VOID  ClientFree(VOID* pObj, const AddrClient* pClient);
+
+    // disallow the copy constructor (declared private)
+    AddrObject(const AddrObject& a);
+
+    // disallow the assignment operator (declared private)
+    AddrObject& operator=(const AddrObject& a);
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/inc/chip/r800/si_gb_reg.h b/src/gallium/winsys/amdgpu/drm/addrlib/inc/chip/r800/si_gb_reg.h
new file mode 100644
index 00000000000..cf67f602bdf
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/inc/chip/r800/si_gb_reg.h
@@ -0,0 +1,155 @@
+#if !defined (__SI_GB_REG_H__)
+#define __SI_GB_REG_H__
+
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+//
+// Make sure the necessary endian defines are there.
+//
+#if defined(LITTLEENDIAN_CPU)
+#elif defined(BIGENDIAN_CPU)
+#else
+#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined"
+#endif
+
+/*
+ * GB_ADDR_CONFIG struct: bit-fields total 32 bits; the big-endian variant reverses the order
+ */
+
+#if     defined(LITTLEENDIAN_CPU)
+
+     typedef struct _GB_ADDR_CONFIG_T {
+          unsigned int num_pipes                      : 3;
+          unsigned int                                : 1;
+          unsigned int pipe_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int bank_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int num_shader_engines             : 2;
+          unsigned int                                : 2;
+          unsigned int shader_engine_tile_size        : 3;
+          unsigned int                                : 1;
+          unsigned int num_gpus                       : 3;
+          unsigned int                                : 1;
+          unsigned int multi_gpu_tile_size            : 2;
+          unsigned int                                : 2;
+          unsigned int row_size                       : 2;
+          unsigned int num_lower_pipes                : 1;
+          unsigned int                                : 1;
+     } GB_ADDR_CONFIG_T;
+
+#elif       defined(BIGENDIAN_CPU)
+
+     typedef struct _GB_ADDR_CONFIG_T {
+          unsigned int                                : 1;
+          unsigned int num_lower_pipes                : 1;
+          unsigned int row_size                       : 2;
+          unsigned int                                : 2;
+          unsigned int multi_gpu_tile_size            : 2;
+          unsigned int                                : 1;
+          unsigned int num_gpus                       : 3;
+          unsigned int                                : 1;
+          unsigned int shader_engine_tile_size        : 3;
+          unsigned int                                : 2;
+          unsigned int num_shader_engines             : 2;
+          unsigned int                                : 1;
+          unsigned int bank_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int pipe_interleave_size           : 3;
+          unsigned int                                : 1;
+          unsigned int num_pipes                      : 3;
+     } GB_ADDR_CONFIG_T;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;      // whole value as one 32-bit field
+     GB_ADDR_CONFIG_T f;         // the same storage viewed as named bit-fields
+} GB_ADDR_CONFIG;
+
+#if       defined(LITTLEENDIAN_CPU)
+
+     typedef struct _GB_TILE_MODE_T {                  // bit-fields total 32 bits
+          unsigned int micro_tile_mode                : 2;
+          unsigned int array_mode                     : 4;
+          unsigned int pipe_config                    : 5;
+          unsigned int tile_split                     : 3;
+          unsigned int bank_width                     : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int num_banks                      : 2;
+          unsigned int micro_tile_mode_new            : 3;
+          unsigned int sample_split                   : 2;
+          unsigned int                                : 5;
+     } GB_TILE_MODE_T;
+
+     typedef struct _GB_MACROTILE_MODE_T {             // bit-fields total 32 bits
+          unsigned int bank_width                     : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int num_banks                      : 2;
+          unsigned int                                : 24;
+     } GB_MACROTILE_MODE_T;
+
+#elif          defined(BIGENDIAN_CPU)
+
+     typedef struct _GB_TILE_MODE_T {                  // same fields, reverse order
+          unsigned int                                : 5;
+          unsigned int sample_split                   : 2;
+          unsigned int micro_tile_mode_new            : 3;
+          unsigned int num_banks                      : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int bank_width                     : 2;
+          unsigned int tile_split                     : 3;
+          unsigned int pipe_config                    : 5;
+          unsigned int array_mode                     : 4;
+          unsigned int micro_tile_mode                : 2;
+     } GB_TILE_MODE_T;
+
+     typedef struct _GB_MACROTILE_MODE_T {             // same fields, reverse order
+          unsigned int                                : 24;
+          unsigned int num_banks                      : 2;
+          unsigned int macro_tile_aspect              : 2;
+          unsigned int bank_height                    : 2;
+          unsigned int bank_width                     : 2;
+     } GB_MACROTILE_MODE_T;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;      // whole value as one 32-bit field
+     GB_TILE_MODE_T f;           // the same storage viewed as named bit-fields
+} GB_TILE_MODE;
+
+typedef union {
+     unsigned int val : 32;      // whole value as one 32-bit field
+     GB_MACROTILE_MODE_T f;      // the same storage viewed as named bit-fields
+} GB_MACROTILE_MODE;
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/inc/lnx_common_defs.h b/src/gallium/winsys/amdgpu/drm/addrlib/inc/lnx_common_defs.h
new file mode 100644
index 00000000000..61540f49b7e
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/inc/lnx_common_defs.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+#ifndef _lnx_common_defs_h_
+#define _lnx_common_defs_h_
+
+#if DBG
+#include <stdarg.h>                         // We do not have any choice: need variable
+                                            // number of parameters support for debug
+                                            // build.
+#endif                                      // #if DBG
+
+//
+// --------------  External functions from Linux kernel driver ----------------
+//
+// Note: The definitions/declarations below must match the original ones.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned long __ke_size_t;              // as it is defined in firegl_public.h
+typedef int           __kernel_ptrdiff_t;       // as it is defined in posix_types.h
+
+
+#if !defined(ATI_API_CALL)
+#define ATI_API_CALL __attribute__((regparm(0)))    // default when not already defined
+#endif
+
+extern void * ATI_API_CALL __ke_memset(void* s, int c, __ke_size_t count);
+extern void * ATI_API_CALL __ke_memcpy(void* d, const void* s, __ke_size_t count);
+extern ATI_API_CALL __ke_size_t __ke_strlen(const char *s);
+extern char* ATI_API_CALL __ke_strcpy(char* d, const char* s);
+extern char* ATI_API_CALL __ke_strncpy(char* d, const char* s, __ke_size_t count);
+extern void __ke_printk(const char* fmt, ...);     // declared without ATI_API_CALL
+
+extern int ATI_API_CALL __ke_snprintf(char* buf, __ke_size_t size, const char* fmt, ...);
+extern int ATI_API_CALL KCL_CopyFromUserSpace(void* to, const void* from, __ke_size_t size);
+extern int ATI_API_CALL KCL_CopyToUserSpace(void* to, const void* from, __ke_size_t size);
+#define __ke_copy_from_user  KCL_CopyFromUserSpace  // alias for the KCL name
+#define __ke_copy_to_user    KCL_CopyToUserSpace    // alias for the KCL name
+extern int ATI_API_CALL __ke_verify_area(int type, const void * addr, unsigned long size);
+
+extern unsigned long ATI_API_CALL KAS_GetTickCounter(void);
+extern unsigned long ATI_API_CALL KAS_GetTicksPerSecond(void);
+
+
+#if DBG
+extern int ATI_API_CALL __ke_vsnprintf(char *buf, __ke_size_t size, const char *fmt, va_list ap);
+#define vsnprintf(_dst, _size, _fmt, varg)  __ke_vsnprintf(_dst, _size, _fmt, varg) // va_list form
+#endif                                      // #if DBG
+
+
+// Note: This function is not defined in firegl_public.h.
+void    firegl_hardwareHangRecovery(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+// --------------------------  C/C++ standard typedefs ----------------------------
+//
+#ifdef __SIZE_TYPE__
+typedef __SIZE_TYPE__       size_t;
+#else                                       // #ifdef __SIZE_TYPE__
+typedef unsigned int        size_t;         // fallback when __SIZE_TYPE__ is not predefined
+#endif                                      // #ifdef __SIZE_TYPE__
+
+#ifdef __PTRDIFF_TYPE__
+typedef __PTRDIFF_TYPE__    ptrdiff_t;
+#else                                       // #ifdef __PTRDIFF_TYPE__
+typedef int                 ptrdiff_t;      // fallback when __PTRDIFF_TYPE__ is not predefined
+#endif                                      // #ifdef __PTRDIFF_TYPE__
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL    __null
+#else
+#define NULL    ((void *)0)
+#endif
+#endif
+
+
+//
+// -------------------------  C/C++ standard macros ---------------------------
+//
+
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)  // as it is defined in stddef.h
+#define CHAR_BIT            8                                   // as it is defined in limits.h
+
+//
+// ---------------------------------  C RTL -----------------------------------
+//
+
+#define memset(_p, _v, _n)                  __ke_memset(_p, _v, _n)
+#define memcpy(_d, _s, _n)                  __ke_memcpy(_d, _s, _n)
+#define strlen(_s)                          __ke_strlen(_s)
+#define strcpy(_d, _s)                      __ke_strcpy(_d, _s)
+#define strncpy(_d, _s, _n)                 __ke_strncpy(_d, _s, _n)
+// Note: C99 supports macros with variable number of arguments. GCC also supports this C99 feature as
+//       C++ extension.
+#define snprintf(_dst, _size, _fmt, arg...) __ke_snprintf(_dst, _size, _fmt, ##arg)
+
+
+#endif                                      // #ifndef _lnx_common_defs_h_
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/chip/si_ci_vi_merged_enum.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/chip/si_ci_vi_merged_enum.h
new file mode 100644
index 00000000000..5ed81add264
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/chip/si_ci_vi_merged_enum.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+#if !defined (SI_CI_VI_MERGED_ENUM_HEADER)
+#define SI_CI_VI_MERGED_ENUM_HEADER
+
+typedef enum PipeInterleaveSize {                   // two encodings, numbered 0 and 1
+    ADDR_CONFIG_PIPE_INTERLEAVE_256B = 0x0,
+    ADDR_CONFIG_PIPE_INTERLEAVE_512B = 0x1,
+} PipeInterleaveSize;
+
+typedef enum RowSize {                              // three encodings, numbered 0 through 2
+    ADDR_CONFIG_1KB_ROW = 0x0,
+    ADDR_CONFIG_2KB_ROW = 0x1,
+    ADDR_CONFIG_4KB_ROW = 0x2,
+} RowSize;
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
new file mode 100644
index 00000000000..7393953c120
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
@@ -0,0 +1,1782 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  ciaddrlib.cpp
+* @brief Contains the implementation for the CIAddrLib class.
+***************************************************************************************************
+*/
+
+#include "ciaddrlib.h"
+
+#include "si_gb_reg.h"
+
+#include "si_ci_vi_merged_enum.h"
+
+#if BRAHMA_BUILD
+#include "amdgpu_id.h"
+#else
+#include "ci_id.h"
+#include "kv_id.h"
+#include "vi_id.h"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrMask
+*
+*   @brief
+*       Gets a mask of "width"
+*   @return
+*       Bit mask
+***************************************************************************************************
+*/
+static UINT_64 AddrMask(
+    UINT_32 width)  ///< Number of low-order bits to set
+{
+    const UINT_32 totalBits = static_cast<UINT_32>(sizeof(UINT_64) * 8);
+
+    // Shifting by the full type width is undefined, so saturate to all ones
+    if (width >= totalBits)
+    {
+        return ~((UINT_64) 0);
+    }
+
+    // Set bit "width", then subtract one to turn on every bit below it
+    const UINT_64 lowOnes = (((UINT_64) 1) << width) - 1;
+    return lowOnes;
+}
+
+/**
+***************************************************************************************************
+*   AddrGetBits
+*
+*   @brief
+*       Gets bits within a range of [msb, lsb]
+*   @return
+*       Bits of this range
+***************************************************************************************************
+*/
+static UINT_64 AddrGetBits(
+    UINT_64 bits,   ///< Source bits
+    UINT_32 msb,    ///< Most significant bit of the range (inclusive)
+    UINT_32 lsb)    ///< Least significant bit of the range (inclusive)
+{
+    // An inverted range (msb below lsb) selects nothing
+    if (msb < lsb)
+    {
+        return 0;
+    }
+    const UINT_32 fieldWidth = 1 + msb - lsb;
+    return (bits >> lsb) & AddrMask(fieldWidth);
+}
+
+/**
+***************************************************************************************************
+*   AddrRemoveBits
+*
+*   @brief
+*       Removes bits within the range of [msb, lsb]
+*   @return
+*       Modified bits
+***************************************************************************************************
+*/
+static UINT_64 AddrRemoveBits(
+    UINT_64 bits,   ///< Source bits
+    UINT_32 msb,    ///< Most significant bit of the range to drop
+    UINT_32 lsb)    ///< Least significant bit of the range to drop
+{
+    if (msb < lsb)
+    {
+        return bits; // inverted range: nothing to remove
+    }
+    // Bits below lsb stay put; bits above msb slide down so that they start at lsb
+    const UINT_64 below = AddrGetBits(bits, lsb - 1, 0);
+    const UINT_64 above = AddrGetBits(bits, 8 * sizeof(bits) - 1, msb + 1);
+    return below | (above << lsb);
+}
+
+/**
+***************************************************************************************************
+*   AddrInsertBits
+*
+*   @brief
+*       Inserts new bits into the range of [msb, lsb]
+*   @return
+*       Modified bits
+***************************************************************************************************
+*/
+static UINT_64 AddrInsertBits(
+    UINT_64 bits,       ///< Source bits
+    UINT_64 newBits,    ///< New bits to be inserted
+    UINT_32 msb,        ///< Most significant bit
+    UINT_32 lsb)        ///< Least significant bit
+{
+    UINT_64 ret = bits;
+    const UINT_32 topBit = static_cast<UINT_32>(8 * sizeof(bits) - 1); // index of the highest bit
+    if (msb >= lsb)
+    {
+        // Old bits from lsb up move above msb; when msb >= topBit they all fall off (no 64-bit shift)
+        const UINT_64 high = (msb < topBit) ? (AddrGetBits(bits, topBit, lsb) << (msb + 1)) : 0;
+        ret = AddrGetBits(bits, lsb - 1, 0) | (AddrGetBits(newBits, msb - lsb, 0) << lsb) | high;
+    }
+    return ret;
+}
+
+
+/**
+***************************************************************************************************
+*   AddrCIHwlInit
+*
+*   @brief
+*       Creates a CIAddrLib object by forwarding pClient to CIAddrLib::CreateObj.
+*
+*   @return
+*       Whatever CIAddrLib::CreateObj returns, as an AddrLib pointer.
+***************************************************************************************************
+*/
+AddrLib* AddrCIHwlInit(const AddrClient* pClient)
+{
+    return CIAddrLib::CreateObj(pClient);
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::CIAddrLib
+*
+*   @brief
+*       Constructor: builds the SIAddrLib base from pClient and resets the CI-specific members
+*
+***************************************************************************************************
+*/
+CIAddrLib::CIAddrLib(const AddrClient* pClient) :
+    SIAddrLib(pClient),
+    m_noOfMacroEntries(0),
+    m_allowNonDispThickModes(FALSE)
+{
+    m_class = CI_ADDRLIB;                           // tag this instance as the CI implementation
+    memset(&m_settings, 0, sizeof(m_settings));     // start with every m_settings field zeroed
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::~CIAddrLib
+*
+*   @brief
+*       Destructor; the body is empty, so no explicit cleanup is performed here
+***************************************************************************************************
+*/
+CIAddrLib::~CIAddrLib()
+{
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeDccInfo
+*
+*   @brief
+*       Compute DCC key size, base alignment and fast clear size from the color surface size
+*   @return
+*       ADDR_OK for macro tiled surfaces when isVolcanicIslands is set, else ADDR_NOTSUPPORTED
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeDccInfo(
+    const ADDR_COMPUTE_DCCINFO_INPUT*  pIn,
+    ADDR_COMPUTE_DCCINFO_OUTPUT*       pOut) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    if (m_settings.isVolcanicIslands && IsMacroTiled(pIn->tileMode))
+    {
+        UINT_64 dccFastClearSize = pIn->colorSurfSize >> 8; // color surface size divided by 256
+
+        ADDR_ASSERT(0 == (pIn->colorSurfSize & 0xff)); // size is expected to be a multiple of 256
+
+        if (pIn->numSamples > 1)
+        {
+            UINT_32 tileSizePerSample = BITS_TO_BYTES(pIn->bpp * MicroTileWidth * MicroTileHeight);
+            UINT_32 samplesPerSplit  = pIn->tileInfo.tileSplitBytes / tileSizePerSample;
+
+            if (samplesPerSplit < pIn->numSamples)
+            {
+                UINT_32 numSplits = pIn->numSamples / samplesPerSplit; // NOTE(review): divides by zero if samplesPerSplit is 0 - confirm
+                UINT_32 fastClearBaseAlign = HwlGetPipes(&pIn->tileInfo) * m_pipeInterleaveBytes;
+
+                ADDR_ASSERT(IsPow2(fastClearBaseAlign));
+
+                dccFastClearSize /= numSplits; // fast clear covers only one sample split
+
+                if (0 != (dccFastClearSize & (fastClearBaseAlign - 1)))
+                {
+                    // Disable dcc fast clear
+                    // if key size of first sample split is not pipe*interleave aligned
+                    dccFastClearSize = 0;
+                }
+            }
+        }
+
+        pOut->dccRamSize          = pIn->colorSurfSize >> 8;
+        pOut->dccRamBaseAlign     = pIn->tileInfo.banks *
+                                    HwlGetPipes(&pIn->tileInfo) *
+                                    m_pipeInterleaveBytes;
+        pOut->dccFastClearSize    = dccFastClearSize;
+
+        ADDR_ASSERT(IsPow2(pOut->dccRamBaseAlign));
+
+        if (0 == (pOut->dccRamSize & (pOut->dccRamBaseAlign - 1))) // key size already base aligned
+        {
+            pOut->subLvlCompressible = TRUE;
+        }
+        else
+        {
+            UINT_64 dccRamSizeAlign = HwlGetPipes(&pIn->tileInfo) * m_pipeInterleaveBytes;
+
+            if (pOut->dccRamSize == pOut->dccFastClearSize) // keep both sizes equal after padding
+            {
+                pOut->dccFastClearSize = PowTwoAlign(pOut->dccRamSize, dccRamSizeAlign);
+            }
+            pOut->dccRamSize          = PowTwoAlign(pOut->dccRamSize, dccRamSizeAlign);
+            pOut->subLvlCompressible  = FALSE;
+        }
+    }
+    else
+    {
+        returnCode = ADDR_NOTSUPPORTED;
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeCmaskAddrFromCoord
+*
+*   @brief
+*       Compute tc compatible Cmask address from fmask ram address
+*
+*   @return
+*       ADDR_OK when isVolcanicIslands and flags.tcCompatible are both TRUE, else ADDR_NOTSUPPORTED
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeCmaskAddrFromCoord(
+    const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT*  pIn,  ///< [in] fmask addr/bpp/tile input
+    ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT*       pOut  ///< [out] cmask address
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_NOTSUPPORTED;
+
+    if ((m_settings.isVolcanicIslands == TRUE) &&
+        (pIn->flags.tcCompatible == TRUE))
+    {
+        UINT_32 numOfPipes   = HwlGetPipes(pIn->pTileInfo);
+        UINT_32 numOfBanks   = pIn->pTileInfo->banks;
+        UINT_64 fmaskAddress = pIn->fmaskAddr;
+        UINT_32 elemBits     = pIn->bpp;
+        UINT_32 blockByte    = 64 * elemBits / 8; // bytes occupied by 64 elements of elemBits each
+        UINT_64 metaNibbleAddress = HwlComputeMetadataNibbleAddress(fmaskAddress,
+                                                                    0,
+                                                                    0,
+                                                                    4,
+                                                                    elemBits,
+                                                                    blockByte,
+                                                                    m_pipeInterleaveBytes,
+                                                                    numOfPipes,
+                                                                    numOfBanks,
+                                                                    1);
+        pOut->addr = (metaNibbleAddress >> 1);               // two nibbles per byte
+        pOut->bitPosition = (metaNibbleAddress % 2) ? 4 : 0; // odd nibble starts at bit 4
+        returnCode = ADDR_OK;
+    }
+
+    return returnCode;
+}
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlConvertChipFamily
+*
+*   @brief
+*       Convert familyID defined in atiid.h to AddrChipFamily and set the matching m_settings flags
+*   @return
+*       AddrChipFamily; this implementation returns ADDR_CHIP_FAMILY_CI for every input
+***************************************************************************************************
+*/
+AddrChipFamily CIAddrLib::HwlConvertChipFamily(
+    UINT_32 uChipFamily,        ///< [in] chip family defined in atiid.h
+    UINT_32 uChipRevision)      ///< [in] chip revision defined in "asic_family"_id.h
+{
+    AddrChipFamily family = ADDR_CHIP_FAMILY_CI; // never reassigned below
+
+    switch (uChipFamily)
+    {
+        case FAMILY_CI:
+            m_settings.isSeaIsland  = 1;
+            m_settings.isBonaire    = ASICREV_IS_BONAIRE_M(uChipRevision);
+            m_settings.isHawaii     = ASICREV_IS_HAWAII_P(uChipRevision);
+            break;
+        case FAMILY_KV:
+            m_settings.isKaveri     = 1;
+            m_settings.isSpectre    = ASICREV_IS_SPECTRE(uChipRevision);
+            m_settings.isSpooky     = ASICREV_IS_SPOOKY(uChipRevision);
+            m_settings.isKalindi    = ASICREV_IS_KALINDI(uChipRevision);
+            break;
+        case FAMILY_VI:
+            m_settings.isVolcanicIslands = 1;
+            m_settings.isIceland         = ASICREV_IS_ICELAND_M(uChipRevision);
+            m_settings.isTonga           = ASICREV_IS_TONGA_P(uChipRevision);
+            m_settings.isFiji            = ASICREV_IS_FIJI_P(uChipRevision);
+            break;
+        case FAMILY_CZ:
+            m_settings.isCarrizo         = 1;
+            m_settings.isVolcanicIslands = 1; // Carrizo also sets the Volcanic Islands flag
+            break;
+        default:
+            ADDR_ASSERT(!"This should be a unexpected Fusion"); // unknown family: no flag is set
+            break;
+    }
+
+    return family;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlInitGlobalParams
+*
+*   @brief
+*       Initializes global parameters: decodes GB registers, picks a default pipe count and
+*       loads the tile and macro tile tables from pCreateIn->regValue
+*   @return
+*       TRUE if all settings are valid
+*
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::HwlInitGlobalParams(
+    const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input
+{
+    BOOL_32  valid = TRUE;
+
+    const ADDR_REGISTER_VALUE* pRegValue = &pCreateIn->regValue;
+
+    valid = DecodeGbRegs(pRegValue);
+
+    // The following assignments for m_pipes is only for fail-safe, InitTileSettingTable should
+    // read the correct pipes from tile mode table
+    if (m_settings.isHawaii)
+    {
+        // Hawaii has 16-pipe, see GFXIP_Config_Summary.xls
+        m_pipes = 16;
+    }
+    else if (m_settings.isBonaire || m_settings.isSpectre)
+    {
+        m_pipes = 4;
+    }
+    else // Treat other KV asics to be 2-pipe
+    {
+        m_pipes = 2;
+    }
+
+    // @todo: VI
+    // Move this to VI code path once created
+    if (m_settings.isTonga) // these VI checks overwrite the value chosen above
+    {
+        m_pipes = 8;
+    }
+    else if (m_settings.isIceland)
+    {
+        m_pipes = 2;
+    }
+    else if (m_settings.isFiji)
+    {
+        m_pipes = 16;
+    }
+
+    if (valid) // the tables are only loaded while every earlier step succeeded
+    {
+        valid = InitTileSettingTable(pRegValue->pTileConfig, pRegValue->noOfEntries);
+    }
+    if (valid)
+    {
+        valid = InitMacroTileCfgTable(pRegValue->pMacroTileConfig, pRegValue->noOfMacroEntries);
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlPostCheckTileIndex
+*
+*   @brief
+*       Map a tile setting to index if curIndex is invalid, otherwise check if curIndex matches
+*       tile mode/type/info and change the index if needed
+*   @return
+*       Tile index, or TileIndexInvalid when the result is not below m_noOfEntries.
+***************************************************************************************************
+*/
+INT_32 CIAddrLib::HwlPostCheckTileIndex(
+    const ADDR_TILEINFO* pInfo,     ///< [in] Tile Info
+    AddrTileMode         mode,      ///< [in] Tile mode
+    AddrTileType         type,      ///< [in] Tile type
+    INT                  curIndex   ///< [in] Current index assigned in HwlSetupTileInfo
+    ) const
+{
+    INT_32 index = curIndex;
+
+    if (mode == ADDR_TM_LINEAR_GENERAL)
+    {
+        index = TileIndexLinearGeneral;
+    }
+    else
+    {
+        BOOL_32 macroTiled = IsMacroTiled(mode);
+
+        // We need to find a new index if any of these is true
+        // 1. curIndex is invalid
+        // 2. tile mode is changed
+        // 3. tile info does not match for macro tiled
+        if ((index == TileIndexInvalid)         ||
+            (mode != m_tileTable[index].mode)   || // NOTE(review): other out-of-range index values are not filtered before this lookup
+            (macroTiled && pInfo->pipeConfig != m_tileTable[index].info.pipeConfig))
+        {
+            for (index = 0; index < static_cast<INT_32>(m_noOfEntries); index++)
+            {
+                if (macroTiled)
+                {
+                    // macro tile modes need all to match
+                    if ((pInfo->pipeConfig == m_tileTable[index].info.pipeConfig) &&
+                        (mode == m_tileTable[index].mode) &&
+                        (type == m_tileTable[index].type))
+                    {
+                        // tileSplitBytes stored in m_tileTable is only valid for depth entries
+                        if (type == ADDR_DEPTH_SAMPLE_ORDER)
+                        {
+                            if (pInfo->tileSplitBytes == m_tileTable[index].info.tileSplitBytes)
+                            {
+                                break;
+                            }
+                        }
+                        else // other entries are determined by other 3 fields
+                        {
+                            break;
+                        }
+                    }
+                }
+                else if (mode == ADDR_TM_LINEAR_ALIGNED)
+                {
+                    // linear mode only needs tile mode to match
+                    if (mode == m_tileTable[index].mode)
+                    {
+                        break;
+                    }
+                }
+                else
+                {
+                    // micro tile modes only need tile mode and tile type to match
+                    if (mode == m_tileTable[index].mode &&
+                        type == m_tileTable[index].type)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    ADDR_ASSERT(index < static_cast<INT_32>(m_noOfEntries));
+    // A search that ran off the end of the table leaves index == m_noOfEntries; report it as invalid
+    if (index >= static_cast<INT_32>(m_noOfEntries))
+    {
+        index = TileIndexInvalid;
+    }
+
+    return index;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlSetupTileCfg
+*
+*   @brief
+*       Map tile index to tile setting; each of pInfo/pMode/pType is filled only when non-NULL.
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS when the tile index is used and is >= m_noOfEntries
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlSetupTileCfg(
+    INT_32          index,          ///< [in] Tile index
+    INT_32          macroModeIndex, ///< [in] Index in macro tile mode table(CI)
+    ADDR_TILEINFO*  pInfo,          ///< [out] Tile Info
+    AddrTileMode*   pMode,          ///< [out] Tile mode
+    AddrTileType*   pType           ///< [out] Tile type
+    ) const
+{
+    ADDR_E_RETURNCODE returnCode = ADDR_OK;
+
+    // Global flag to control usage of tileIndex
+    if (UseTileIndex(index))
+    {
+        if (static_cast<UINT_32>(index) >= m_noOfEntries) // the unsigned cast also rejects negatives
+        {
+            returnCode = ADDR_INVALIDPARAMS;
+        }
+        else
+        {
+            const ADDR_TILECONFIG* pCfgTable = GetTileSetting(index);
+
+            if (pInfo != NULL)
+            {
+                if (IsMacroTiled(pCfgTable->mode))
+                {
+                    ADDR_ASSERT(((macroModeIndex != TileIndexInvalid)
+                        && (macroModeIndex != TileIndexNoMacroIndex)));
+                    // Here we use tile_bytes in place of tile_split,
+                    // according to the info below:
+                    // "tile_split_c = MIN(ROW_SIZE, tile_split)
+                    // "tile_bytes = MIN(tile_split_c, num_samples * tile_bytes_1x)
+                    // when using tile_bytes replacing of tile_split, the result of
+                    // alignment and others(such as slicesPerTile) are unaffected -
+                    // since if tile_split_c is larger, split won't happen, otherwise
+                    // (num_samples * tile_bytes_1x is larger), a correct tile_split is
+                    // returned.
+                    *pInfo = m_macroTileTable[macroModeIndex]; // NOTE(review): macroModeIndex is only assert-checked above - confirm
+
+                    if (pCfgTable->type == ADDR_DEPTH_SAMPLE_ORDER)
+                    {
+                        pInfo->tileSplitBytes = pCfgTable->info.tileSplitBytes;
+                    }
+                    pInfo->pipeConfig = pCfgTable->info.pipeConfig; // pipe config comes from the tile table
+                }
+                else // 1D and linear modes, we return default value stored in table
+                {
+                    *pInfo = pCfgTable->info;
+                }
+            }
+
+            if (pMode != NULL)
+            {
+                *pMode = pCfgTable->mode;
+            }
+
+            if (pType != NULL)
+            {
+                *pType = pCfgTable->type;
+            }
+        }
+    }
+
+    return returnCode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeSurfaceInfo
+*
+*   @brief
+*       CI entry point for ComputeSurfaceInfo; wraps the SIAddrLib version and fixes up macroModeIndex
+*   @return
+*       ADDR_E_RETURNCODE produced by SIAddrLib::HwlComputeSurfaceInfo
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    // Without a valid tile index the macro mode index must start out invalid as well
+    if (TileIndexInvalid == pIn->tileIndex)
+    {
+        pOut->macroModeIndex = TileIndexInvalid;
+    }
+
+    const ADDR_E_RETURNCODE status = SIAddrLib::HwlComputeSurfaceInfo(pIn, pOut);
+    // Hand the "no macro index" marker back to the caller as an invalid index
+    if (TileIndexNoMacroIndex == pOut->macroModeIndex)
+    {
+        pOut->macroModeIndex = TileIndexInvalid;
+    }
+
+    return status;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeFmaskInfo
+*   @brief
+*       Entry of r800's ComputeFmaskInfo; fixes the tile index to entry 14 or 15 before dispatching
+*   @return
+*       ADDR_E_RETURNCODE returned by DispatchComputeFmaskInfo
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE CIAddrLib::HwlComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,   ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut   ///< [out] output structure
+    )
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    ADDR_TILEINFO tileInfo = {0};
+    ADDR_COMPUTE_FMASK_INFO_INPUT fmaskIn;
+    fmaskIn = *pIn; // local copy so tileIndex and pTileInfo can be overridden below
+
+    AddrTileMode tileMode = pIn->tileMode;
+
+    // Use internal tile info if pOut does not have a valid pTileInfo
+    if (pOut->pTileInfo == NULL)
+    {
+        pOut->pTileInfo = &tileInfo;
+    }
+
+    ADDR_ASSERT(tileMode == ADDR_TM_2D_TILED_THIN1     ||
+                tileMode == ADDR_TM_3D_TILED_THIN1     ||
+                tileMode == ADDR_TM_PRT_TILED_THIN1    ||
+                tileMode == ADDR_TM_PRT_2D_TILED_THIN1 ||
+                tileMode == ADDR_TM_PRT_3D_TILED_THIN1);
+
+    ADDR_ASSERT(m_tileTable[14].mode == ADDR_TM_2D_TILED_THIN1);
+    ADDR_ASSERT(m_tileTable[15].mode == ADDR_TM_3D_TILED_THIN1);
+
+    // The only valid tile modes for fmask are 2D_THIN1 and 3D_THIN1 plus non-displayable
+    INT_32 tileIndex = tileMode == ADDR_TM_2D_TILED_THIN1 ? 14 : 15; // every other mode uses entry 15
+    ADDR_SURFACE_FLAGS flags = {{0}};
+    flags.fmask = 1;
+
+    INT_32 macroModeIndex = TileIndexInvalid;
+
+    UINT_32 numSamples = pIn->numSamples;
+    UINT_32 numFrags = pIn->numFrags == 0 ? numSamples : pIn->numFrags; // 0 means "same as samples"
+
+    UINT_32 bpp = QLog2(numFrags);
+
+    // EQAA needs one more bit
+    if (numSamples > numFrags)
+    {
+        bpp++;
+    }
+
+    if (bpp == 3) // a 3-bit value is widened to 4 bits
+    {
+        bpp = 4;
+    }
+
+    bpp = Max(8u, bpp * numSamples); // bits for all samples together, never fewer than 8
+
+    macroModeIndex = HwlComputeMacroModeIndex(tileIndex, flags, bpp, numSamples, pOut->pTileInfo);
+
+    fmaskIn.tileIndex = tileIndex;
+    fmaskIn.pTileInfo = pOut->pTileInfo;
+    pOut->macroModeIndex = macroModeIndex;
+    pOut->tileIndex = tileIndex;
+
+    retCode = DispatchComputeFmaskInfo(&fmaskIn, pOut);
+
+    if (retCode == ADDR_OK)
+    {
+        pOut->tileIndex =
+            HwlPostCheckTileIndex(pOut->pTileInfo, pIn->tileMode, ADDR_NON_DISPLAYABLE,
+                                  pOut->tileIndex);
+    }
+
+    // Resets pTileInfo to NULL if the internal tile info is used
+    if (pOut->pTileInfo == &tileInfo)
+    {
+        pOut->pTileInfo = NULL;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlFmaskPreThunkSurfInfo
+*
+*   @brief
+*       Some preparation before thunking a ComputeSurfaceInfo call for Fmask
+*   @return
+*       None (VOID); copies tileIndex into pSurfIn and macroModeIndex into pSurfOut
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlFmaskPreThunkSurfInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pFmaskIn,   ///< [in] Input of fmask info
+    const ADDR_COMPUTE_FMASK_INFO_OUTPUT*   pFmaskOut,  ///< [in] Output of fmask info
+    ADDR_COMPUTE_SURFACE_INFO_INPUT*        pSurfIn,    ///< [out] Input of thunked surface info
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pSurfOut    ///< [out] Output of thunked surface info
+    ) const
+{
+    pSurfIn->tileIndex = pFmaskIn->tileIndex;
+    pSurfOut->macroModeIndex  = pFmaskOut->macroModeIndex;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlFmaskPostThunkSurfInfo
+*
+*   @brief
+*       Copy hwl extra field after calling thunked ComputeSurfaceInfo
+*   @return
+*       None (VOID); copies tileIndex and macroModeIndex from pSurfOut into pFmaskOut
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlFmaskPostThunkSurfInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,   ///< [in] Output of surface info
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut           ///< [out] Output of fmask info
+    ) const
+{
+    pFmaskOut->tileIndex = pSurfOut->tileIndex;
+    pFmaskOut->macroModeIndex = pSurfOut->macroModeIndex;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlDegradeThickTileMode
+*
+*   @brief
+*       Degrades valid tile mode for thick modes if needed; this implementation degrades nothing
+*
+*   @return
+*       baseTileMode, unchanged
+***************************************************************************************************
+*/
+AddrTileMode CIAddrLib::HwlDegradeThickTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             numSlices,      ///< [in] current number of slices (not read here)
+    UINT_32*            pBytesPerTile   ///< [in/out] pointer to bytes per slice (not touched here)
+    ) const
+{
+    return baseTileMode;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlOverrideTileMode
+*
+*   @brief
+*       Override THICK to THIN, for specific formats on CI, and fold PRT 2D/3D modes into PRT modes
+*
+*   @return
+*       TRUE if *pTileMode was changed, FALSE otherwise
+*
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::HwlOverrideTileMode(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,       ///< [in] input structure
+    AddrTileMode*                           pTileMode, ///< [in/out] pointer to the tile mode
+    AddrTileType*                           pTileType  ///< [in/out] pointer to the tile type
+    ) const
+{
+    BOOL_32 bOverrided = FALSE;
+    AddrTileMode tileMode = *pTileMode; // working copy; written back only if it ends up different
+
+    // currently, all CI/VI family do not
+    // support ADDR_TM_PRT_2D_TILED_THICK,ADDR_TM_PRT_3D_TILED_THICK and
+    // ADDR_TM_PRT_2D_TILED_THIN1, ADDR_TM_PRT_3D_TILED_THIN1
+    switch (tileMode)
+    {
+        case ADDR_TM_PRT_2D_TILED_THICK:
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            tileMode = ADDR_TM_PRT_TILED_THICK;
+            break;
+        case ADDR_TM_PRT_2D_TILED_THIN1:
+        case ADDR_TM_PRT_3D_TILED_THIN1:
+            tileMode = ADDR_TM_PRT_TILED_THIN1;
+            break;
+        default:
+            break;
+    }
+
+    // UBTS#404321, we do not need such overriding, as THICK+THICK entries removed from the tile-mode table
+    if (!m_settings.isBonaire)
+    {
+        UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+        // tile_thickness = (array_mode == XTHICK) ? 8 : ((array_mode == THICK) ? 4 : 1)
+        if (thickness > 1)
+        {
+            switch (pIn->format)
+            {
+                // see //gfxip/gcB/devel/cds/src/verif/tc/models/csim/tcp.cpp
+                // tcpError("Thick micro tiling is not supported for format...
+                case ADDR_FMT_X24_8_32_FLOAT:
+                case ADDR_FMT_32_AS_8:
+                case ADDR_FMT_32_AS_8_8:
+                case ADDR_FMT_32_AS_32_32_32_32:
+
+                // packed formats
+                case ADDR_FMT_GB_GR:
+                case ADDR_FMT_BG_RG:
+                case ADDR_FMT_1_REVERSED:
+                case ADDR_FMT_1:
+                case ADDR_FMT_BC1:
+                case ADDR_FMT_BC2:
+                case ADDR_FMT_BC3:
+                case ADDR_FMT_BC4:
+                case ADDR_FMT_BC5:
+                case ADDR_FMT_BC6:
+                case ADDR_FMT_BC7:
+                    switch (tileMode)
+                    {
+                        case ADDR_TM_1D_TILED_THICK:
+                            tileMode    = ADDR_TM_1D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_2D_TILED_XTHICK:
+                        case ADDR_TM_2D_TILED_THICK:
+                            tileMode    = ADDR_TM_2D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_3D_TILED_XTHICK:
+                        case ADDR_TM_3D_TILED_THICK:
+                            tileMode    = ADDR_TM_3D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_PRT_TILED_THICK:
+                            tileMode    = ADDR_TM_PRT_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_PRT_2D_TILED_THICK: // already remapped to PRT_TILED_THICK by the first switch
+                            tileMode    = ADDR_TM_PRT_2D_TILED_THIN1;
+                            break;
+
+                        case ADDR_TM_PRT_3D_TILED_THICK: // already remapped to PRT_TILED_THICK by the first switch
+                            tileMode    = ADDR_TM_PRT_3D_TILED_THIN1;
+                            break;
+
+                        default:
+                            break;
+
+                    }
+
+                    // Switch tile type from thick to thin
+                    if (tileMode != *pTileMode)
+                    {
+                        // see tileIndex: 13-18
+                        *pTileType = ADDR_NON_DISPLAYABLE;
+                    }
+
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    if (tileMode != *pTileMode) // publish the new mode and report the override
+    {
+        *pTileMode = tileMode;
+        bOverrided = TRUE;
+    }
+
+    return bOverrided;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlSetupTileInfo
+*
+*   @brief
+*       Setup default value of tile info for CI
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlSetupTileInfo(
+    AddrTileMode                        tileMode,       ///< [in] Tile mode
+    ADDR_SURFACE_FLAGS                  flags,          ///< [in] Surface type flags
+    UINT_32                             bpp,            ///< [in] Bits per pixel
+    UINT_32                             pitch,          ///< [in] Pitch in pixels (not read here)
+    UINT_32                             height,         ///< [in] Height in pixels (not read here)
+    UINT_32                             numSamples,     ///< [in] Number of samples
+    ADDR_TILEINFO*                      pTileInfoIn,    ///< [in] Tile info input: NULL for default
+    ADDR_TILEINFO*                      pTileInfoOut,   ///< [out] Tile info output
+    AddrTileType                        inTileType,     ///< [in] Tile type
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*   pOut            ///< [out] Output
+    ) const
+{
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+    ADDR_TILEINFO* pTileInfo = pTileInfoOut;
+    INT index = TileIndexInvalid;
+    INT macroModeIndex = TileIndexInvalid;
+
+    // Fail-safe code: coerce inTileType based on thickness, bpp, tile mode and surface flags
+    if (!IsLinear(tileMode))
+    {
+        // Thick tile modes must use thick micro tile mode but Bonaire does not support due to
+        // old derived netlists (UBTS 404321)
+        if (thickness > 1)
+        {
+            if (m_settings.isBonaire)
+            {
+                inTileType = ADDR_NON_DISPLAYABLE;
+            }
+            else if ((m_allowNonDispThickModes == FALSE) || (inTileType != ADDR_NON_DISPLAYABLE))
+            {
+                inTileType = ADDR_THICK;
+            }
+        }
+        // 128 bpp tiling must be non-displayable.
+        // Fmask reuse color buffer's entry but bank-height field can be from another entry
+        // To simplify the logic, fmask entry should be picked from non-displayable ones
+        else if (bpp == 128 || flags.fmask)
+        {
+            inTileType = ADDR_NON_DISPLAYABLE;
+        }
+        // These two modes only have non-disp entries though they can be other micro tile modes
+        else if (tileMode == ADDR_TM_3D_TILED_THIN1 || tileMode == ADDR_TM_PRT_3D_TILED_THIN1)
+        {
+            inTileType = ADDR_NON_DISPLAYABLE;
+        }
+
+        if (flags.depth || flags.stencil)
+        {
+            inTileType = ADDR_DEPTH_SAMPLE_ORDER;
+        }
+    }
+
+    if (IsTileInfoAllZero(pTileInfo))
+    {
+        // See table entries 0-4
+        if (flags.depth || flags.stencil)
+        {
+            if (flags.depth && flags.tcCompatible)
+            {
+                // tileSize = bpp * numSamples * 8 * 8 / 8
+                UINT_32 tileSize = bpp * numSamples * 8;
+
+                // Texture-readable depth surface should not be split
+                switch (tileSize)
+                {
+                    case 128:
+                        index = 1;
+                        break;
+                    case 256:
+                        index = 2;
+                        break;
+                    case 512:
+                        index = 3;
+                        break;
+                    default:
+                        index = 4;
+                        break;
+                }
+            }
+            else
+            {
+                // Depth and stencil need to use the same index, thus the pre-defined tile_split
+                // can meet the requirement to choose the same macro mode index
+                // uncompressed depth/stencil are not supported for now
+                switch (numSamples)
+                {
+                    case 1:
+                        index = 0;
+                        break;
+                    case 2:
+                    case 4:
+                        index = 1;
+                        break;
+                    case 8:
+                        index = 2;
+                        break;
+                    default:
+                        break;
+                }
+            }
+        }
+
+        // See table entries 5-6
+        if (inTileType == ADDR_DEPTH_SAMPLE_ORDER)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 5;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 6;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 8-12
+        if (inTileType == ADDR_DISPLAYABLE)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 9;
+                    break;
+                case ADDR_TM_2D_TILED_THIN1:
+                    index = 10;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 11;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 13-18
+        if (inTileType == ADDR_NON_DISPLAYABLE)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 13;
+                    break;
+                case ADDR_TM_2D_TILED_THIN1:
+                    index = 14;
+                    break;
+                case ADDR_TM_3D_TILED_THIN1:
+                    index = 15;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 16;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 19-26
+        if (thickness > 1)
+        {
+            switch (tileMode)
+            {
+            case ADDR_TM_1D_TILED_THICK:
+                    // Special check for Bonaire, for compatibility between old KMD and new UMD
+                    index = ((inTileType == ADDR_THICK) || m_settings.isBonaire) ? 19 : 18;
+                    break;
+            case ADDR_TM_2D_TILED_THICK:
+                    // Special check for Bonaire, for compatibility between old KMD and new UMD
+                    index = ((inTileType == ADDR_THICK) || m_settings.isBonaire) ? 20 : 24;
+                    break;
+                case ADDR_TM_3D_TILED_THICK:
+                    index = 21;
+                    break;
+                case ADDR_TM_PRT_TILED_THICK:
+                    index = 22;
+                    break;
+                case ADDR_TM_2D_TILED_XTHICK:
+                    index = 25;
+                    break;
+                case ADDR_TM_3D_TILED_XTHICK:
+                    index = 26;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // See table entries 27-30
+        if (inTileType == ADDR_ROTATED)
+        {
+            switch (tileMode)
+            {
+                case ADDR_TM_1D_TILED_THIN1:
+                    index = 27;
+                    break;
+                case ADDR_TM_2D_TILED_THIN1:
+                    index = 28;
+                    break;
+                case ADDR_TM_PRT_TILED_THIN1:
+                    index = 29;
+                    break;
+                case ADDR_TM_PRT_2D_TILED_THIN1:
+                    index = 30;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        if (m_pipes >= 8)
+        {
+            ADDR_ASSERT((index + 1) < static_cast<INT_32>(m_noOfEntries));
+            // Only do this when tile mode table is updated.
+            if (((tileMode == ADDR_TM_PRT_TILED_THIN1) || (tileMode == ADDR_TM_PRT_TILED_THICK)) &&
+                (m_tileTable[index+1].mode == tileMode))
+            {
+                UINT_32 bytesXSamples = bpp * numSamples / 8;
+                UINT_32 bytesXThickness = bpp * thickness / 8;
+                UINT_32 switchP4Threshold = (m_pipes == 16) ? 8 : 32;
+
+                if ((bytesXSamples > switchP4Threshold) || (bytesXThickness > switchP4Threshold))
+                {
+                    // Pick next 4 pipe entry
+                    index += 1;
+                }
+            }
+        }
+    }
+    else
+    {
+        // A pre-filled tile info is ready
+        index = pOut->tileIndex;
+        macroModeIndex = pOut->macroModeIndex;
+
+        // pass tile type back for post tile index compute
+        pOut->tileType = inTileType;
+    }
+
+    // We only need to set up tile info if there is a valid index but macroModeIndex is invalid
+    if (index != TileIndexInvalid && macroModeIndex == TileIndexInvalid)
+    {
+        macroModeIndex = HwlComputeMacroModeIndex(index, flags, bpp, numSamples, pTileInfo);
+
+        /// Copy to pOut->tileType/tileIndex/macroModeIndex
+        pOut->tileIndex = index;
+        pOut->tileType = m_tileTable[index].type; // Or inTileType, the same
+        pOut->macroModeIndex = macroModeIndex;
+    }
+    else if (tileMode == ADDR_TM_LINEAR_GENERAL)
+    {
+        pOut->tileIndex = TileIndexLinearGeneral;
+
+        // Copy linear-aligned entry??
+        *pTileInfo = m_tileTable[8].info;
+    }
+    else if (tileMode == ADDR_TM_LINEAR_ALIGNED)
+    {
+        pOut->tileIndex = 8;
+        *pTileInfo = m_tileTable[8].info;
+    }
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::ReadGbTileMode
+*
+*   @brief
+*       Convert GB_TILE_MODE HW value to ADDR_TILE_CONFIG.
+*   @return
+*       NA.
+***************************************************************************************************
+*/
+VOID CIAddrLib::ReadGbTileMode(
+    UINT_32             regValue,   ///< [in] GB_TILE_MODE register
+    ADDR_TILECONFIG*    pCfg        ///< [out] output structure
+    ) const
+{
+    GB_TILE_MODE reg;
+    reg.val = regValue;
+
+    // The register's pipe_config field is biased by one before it is cast to AddrPipeCfg.
+    pCfg->info.pipeConfig = static_cast<AddrPipeCfg>(reg.f.pipe_config + 1);
+    pCfg->type = static_cast<AddrTileType>(reg.f.micro_tile_mode_new);
+
+    // Depth entries decode tile_split as 64 << n; all other entries keep 1 << sample_split.
+    pCfg->info.tileSplitBytes = (pCfg->type == ADDR_DEPTH_SAMPLE_ORDER) ?
+                                (64 << reg.f.tile_split) :
+                                (1 << reg.f.sample_split);
+
+    // Array modes without a case below are cast to AddrTileMode unchanged.
+    const UINT_32 arrayMode = reg.f.array_mode;
+    AddrTileMode  tileMode  = static_cast<AddrTileMode>(arrayMode);
+
+    switch (arrayMode)
+    {
+        case 5:
+            tileMode = ADDR_TM_PRT_TILED_THIN1;
+            break;
+        case 6:
+            tileMode = ADDR_TM_PRT_2D_TILED_THIN1;
+            break;
+        case 8:
+            tileMode = ADDR_TM_2D_TILED_XTHICK;
+            break;
+        case 9:
+            tileMode = ADDR_TM_PRT_TILED_THICK;
+            break;
+        case 0xa:
+            tileMode = ADDR_TM_PRT_2D_TILED_THICK;
+            break;
+        case 0xb:
+            tileMode = ADDR_TM_PRT_3D_TILED_THIN1;
+            break;
+        case 0xe:
+            tileMode = ADDR_TM_3D_TILED_XTHICK;
+            break;
+        case 0xf:
+            tileMode = ADDR_TM_PRT_3D_TILED_THICK;
+            break;
+        default:
+            break;
+    }
+
+    pCfg->mode = tileMode;
+
+    // Fail-safe: modes that are not macro tiled return this tile table entry directly,
+    // without a macro mode table lookup, so give them fixed bank settings here
+    if (!IsMacroTiled(tileMode))
+    {
+        pCfg->info.banks = 2;
+        pCfg->info.bankWidth = 1;
+        pCfg->info.bankHeight = 1;
+        pCfg->info.macroAspectRatio = 1;
+        pCfg->info.tileSplitBytes = 64;
+    }
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::InitTileSettingTable
+*
+*   @brief
+*       Initialize the ADDR_TILE_CONFIG table.
+*   @return
+*       TRUE if tile table is correctly initialized
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::InitTileSettingTable(
+    const UINT_32*  pCfg,           ///< [in] Pointer to table of tile configs
+    UINT_32         noOfEntries     ///< [in] Number of entries in the table above
+    )
+{
+    BOOL_32 initOk = TRUE;
+
+    ADDR_ASSERT(noOfEntries <= TileTableSize);
+    memset(m_tileTable, 0, sizeof(m_tileTable));
+
+    // 0 means "use the full table"; an oversized count is clamped so m_tileTable is not overrun
+    if ((noOfEntries == 0) || (noOfEntries > TileTableSize))
+    {
+        m_noOfEntries = TileTableSize;
+    }
+    else
+    {
+        m_noOfEntries = noOfEntries;
+    }
+
+    if (pCfg) // From Client
+    {
+        for (UINT_32 i = 0; i < m_noOfEntries; i++)
+        {
+            ReadGbTileMode(*(pCfg + i), &m_tileTable[i]);
+        }
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        initOk = FALSE;
+    }
+
+    if (initOk)
+    {
+        ADDR_ASSERT(m_tileTable[TILEINDEX_LINEAR_ALIGNED].mode == ADDR_TM_LINEAR_ALIGNED);
+
+        if (m_settings.isBonaire == FALSE)
+        {
+            // Check if entry 18 is "thick+thin" combination
+            if ((m_tileTable[18].mode == ADDR_TM_1D_TILED_THICK) &&
+                (m_tileTable[18].type == ADDR_NON_DISPLAYABLE))
+            {
+                m_allowNonDispThickModes = TRUE;
+                ADDR_ASSERT(m_tileTable[24].mode == ADDR_TM_2D_TILED_THICK);
+            }
+        }
+        else
+        {
+            m_allowNonDispThickModes = TRUE;
+        }
+
+        // Assume the first entry is always programmed with full pipes
+        m_pipes = HwlGetPipes(&m_tileTable[0].info);
+    }
+
+    return initOk;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::ReadGbMacroTileCfg
+*
+*   @brief
+*       Convert GB_MACRO_TILE_CFG HW value to ADDR_TILE_CONFIG.
+*   @return
+*       NA.
+***************************************************************************************************
+*/
+VOID CIAddrLib::ReadGbMacroTileCfg(
+    UINT_32             regValue,   ///< [in] GB_MACRO_TILE_MODE register
+    ADDR_TILEINFO*      pCfg        ///< [out] output structure
+    ) const
+{
+    GB_MACROTILE_MODE macroReg;
+    macroReg.val = regValue;
+    // Each field is expanded as a power of two; num_banks is biased by one first
+    pCfg->banks            = 1 << (macroReg.f.num_banks + 1);
+    pCfg->bankWidth        = 1 << macroReg.f.bank_width;
+    pCfg->bankHeight       = 1 << macroReg.f.bank_height;
+    pCfg->macroAspectRatio = 1 << macroReg.f.macro_tile_aspect;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::InitMacroTileCfgTable
+*
+*   @brief
+*       Initialize the ADDR_MACRO_TILE_CONFIG table.
+*   @return
+*       TRUE if macro tile table is correctly initialized
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::InitMacroTileCfgTable(
+    const UINT_32*  pCfg,           ///< [in] Pointer to table of tile configs
+    UINT_32         noOfMacroEntries     ///< [in] Number of entries in the table above
+    )
+{
+    BOOL_32 initOk = TRUE;
+
+    ADDR_ASSERT(noOfMacroEntries <= MacroTileTableSize);
+    memset(m_macroTileTable, 0, sizeof(m_macroTileTable));
+
+    // 0 means "use the full table"; an oversized count is clamped so the table is not overrun
+    if ((noOfMacroEntries == 0) || (noOfMacroEntries > MacroTileTableSize))
+    {
+        m_noOfMacroEntries = MacroTileTableSize;
+    }
+    else
+    {
+        m_noOfMacroEntries = noOfMacroEntries;
+    }
+
+    if (pCfg) // From Client
+    {
+        for (UINT_32 i = 0; i < m_noOfMacroEntries; i++)
+        {
+            ReadGbMacroTileCfg(*(pCfg + i), &m_macroTileTable[i]);
+
+            m_macroTileTable[i].tileSplitBytes = 64 << (i % 8);
+        }
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        initOk = FALSE;
+    }
+    return initOk;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeMacroModeIndex
+*
+*   @brief
+*       Computes macro tile mode index
+*   @return
+*       Macro tile mode index, or TileIndexNoMacroIndex when no macro mode is looked up
+***************************************************************************************************
+*/
+INT_32 CIAddrLib::HwlComputeMacroModeIndex(
+    INT_32              tileIndex,      ///< [in] Tile mode index
+    ADDR_SURFACE_FLAGS  flags,          ///< [in] Surface flags
+    UINT_32             bpp,            ///< [in] Bit per pixel
+    UINT_32             numSamples,     ///< [in] Number of samples
+    ADDR_TILEINFO*      pTileInfo,      ///< [out] Pointer to ADDR_TILEINFO
+    AddrTileMode*       pTileMode,      ///< [out] Pointer to AddrTileMode
+    AddrTileType*       pTileType       ///< [out] Pointer to AddrTileType
+    ) const
+{
+    // A tc compatible stencil surface gets no macro mode index, and none of the output
+    // pointers are written for it.
+    if (flags.tcCompatible && flags.stencil)
+    {
+        return TileIndexNoMacroIndex;
+    }
+
+    const ADDR_TILECONFIG& entry     = m_tileTable[tileIndex];
+    const AddrTileMode     entryMode = entry.mode;
+    const AddrTileType     entryType = entry.type;
+    const UINT_32          thickness = ComputeSurfaceThickness(entryMode);
+
+    INT_32 macroModeIndex = TileIndexNoMacroIndex; // kept as-is by non-macro-tiled modes
+
+    if (IsMacroTiled(entryMode))
+    {
+        const BOOL_32 isDepthEntry = (entryType == ADDR_DEPTH_SAMPLE_ORDER);
+        const UINT_32 tileBytes1x  = BITS_TO_BYTES(bpp * MicroTilePixels * thickness);
+
+        UINT_32 tileSplit;
+
+        if (isDepthEntry)
+        {
+            // Depth entries store real tileSplitBytes
+            tileSplit = entry.info.tileSplitBytes;
+        }
+        else
+        {
+            // Non-depth entries store a split factor: scale it by the single-sample tile
+            // size, with a lower bound of 256 bytes
+            tileSplit = Max(256u, entry.info.tileSplitBytes * tileBytes1x);
+        }
+
+        const UINT_32 tileSplitC = Min(m_rowSize, tileSplit);
+
+        // Fmask is sized from a single sample; other surfaces scale by numSamples
+        UINT_32 tileBytes;
+
+        if (flags.fmask)
+        {
+            tileBytes = Min(tileSplitC, tileBytes1x);
+        }
+        else
+        {
+            tileBytes = Min(tileSplitC, numSamples * tileBytes1x);
+        }
+
+        // The index is log2 of the tile size in 64-byte units, so 64 bytes is the floor
+        if (tileBytes < 64)
+        {
+            tileBytes = 64;
+        }
+
+        macroModeIndex = Log2(tileBytes / 64);
+
+        if (flags.prt || IsPrtTileMode(entryMode))
+        {
+            // Unknown - assume it is 1/2 of table size
+            const UINT_32 PrtMacroModeOffset = MacroTileTableSize / 2;
+
+            macroModeIndex += PrtMacroModeOffset;
+        }
+
+        // Start from the macro table entry, then overlay fields from the tile table entry
+        *pTileInfo = m_macroTileTable[macroModeIndex];
+        pTileInfo->pipeConfig = entry.info.pipeConfig;
+
+        if (isDepthEntry)
+        {
+            pTileInfo->tileSplitBytes = entry.info.tileSplitBytes;
+        }
+        else
+        {
+            pTileInfo->tileSplitBytes = tileSplitC;
+        }
+    }
+    else
+    {
+        // No macro table lookup: hand back the tile table entry's own info
+        *pTileInfo = entry.info;
+    }
+
+    if (pTileMode != NULL)
+    {
+        *pTileMode = entryMode;
+    }
+
+    if (pTileType != NULL)
+    {
+        *pTileType = entryType;
+    }
+
+    return macroModeIndex;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeTileDataWidthAndHeightLinear
+*
+*   @brief
+*       Compute the squared cache shape for per-tile data (CMASK and HTILE) for linear layout
+*
+*   @return
+*       N/A
+*
+*   @note
+*       MacroWidth and macroHeight are measured in pixels
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlComputeTileDataWidthAndHeightLinear(
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight,    ///< [out] macro tile height
+    UINT_32         bpp,             ///< [in] bits per pixel (unused here)
+    ADDR_TILEINFO*  pTileInfo        ///< [in] tile info
+    ) const
+{
+    ADDR_ASSERT(pTileInfo != NULL);
+
+    // Micro tiles along each macro tile edge: 4 unless the pipe config is listed below
+    UINT_32 tilesPerEdge = 4;
+
+    switch (pTileInfo->pipeConfig)
+    {
+        case ADDR_PIPECFG_P16_32x32_8x16:
+        case ADDR_PIPECFG_P16_32x32_16x16:
+        case ADDR_PIPECFG_P8_32x64_32x32:
+        case ADDR_PIPECFG_P8_32x32_16x32:
+        case ADDR_PIPECFG_P8_32x32_16x16:
+        case ADDR_PIPECFG_P8_32x32_8x16:
+        case ADDR_PIPECFG_P4_32x32:
+            tilesPerEdge = 8;
+            break;
+        default:
+            break;
+    }
+
+    *pMacroWidth    = tilesPerEdge * MicroTileWidth;
+    *pMacroHeight   = tilesPerEdge * MicroTileHeight;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlStereoCheckRightOffsetPadding
+*
+*   @brief
+*       check if the height needs extra padding for stereo right eye offset, to avoid swizzling
+*
+*   @return
+*       TRUE if the extra padding is needed
+*
+*   @note
+*       Kalindi (Kabini) is the only one that needs this padding as there is an uncertain
+*       possible HW issue where the right eye displays incorrectly with some type of swizzles, if
+*       the right eye offset is not 64KB aligned - EPR#366461
+*       Other Kaveri APUs also need the padding according to DXX team's report otherwise
+*       corruption observed. - EPR#374788
+***************************************************************************************************
+*/
+BOOL_32 CIAddrLib::HwlStereoCheckRightOffsetPadding() const
+{
+    // The decision rests on the isKaveri chip setting alone
+    if (m_settings.isKaveri)
+    {
+        return TRUE;
+    }
+
+    // Any other chip gets no extra padding
+    return FALSE;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlComputeMetadataNibbleAddress
+*
+*   @brief
+*        calculate meta data address based on input information
+*
+*   &parameter
+*        uncompressedDataByteAddress - address of a pixel in color surface
+*        dataBaseByteAddress         - base address of color surface
+*        metadataBaseByteAddress     - base address of meta ram
+*        metadataBitSize             - meta key size, 8 for DCC, 4 for cmask
+*        elementBitSize              - element size of color surface
+*        blockByteSize               - compression block size, 256 for DCC
+*        pipeInterleaveBytes         - pipe interleave size
+*        numOfPipes                  - number of pipes
+*        numOfBanks                  - number of banks
+*        numOfSamplesPerSplit        - number of samples per tile split
+*   @return
+*        meta data nibble address (nibble address is used to support DCC compatible cmask)
+*
+***************************************************************************************************
+*/
+UINT_64 CIAddrLib::HwlComputeMetadataNibbleAddress(
+    UINT_64 uncompressedDataByteAddress,
+    UINT_64 dataBaseByteAddress,
+    UINT_64 metadataBaseByteAddress,
+    UINT_32 metadataBitSize,
+    UINT_32 elementBitSize,
+    UINT_32 blockByteSize,
+    UINT_32 pipeInterleaveBytes,
+    UINT_32 numOfPipes,
+    UINT_32 numOfBanks,
+    UINT_32 numOfSamplesPerSplit) const
+{
+    ///--------------------------------------------------------------------------------------------
+    /// Get pipe interleave, bank and pipe bits
+    ///--------------------------------------------------------------------------------------------
+    UINT_32 pipeInterleaveBits  = Log2(pipeInterleaveBytes);
+    UINT_32 pipeBits            = Log2(numOfPipes);
+    UINT_32 bankBits            = Log2(numOfBanks);
+
+    ///--------------------------------------------------------------------------------------------
+    /// Clear pipe and bank swizzles
+    ///--------------------------------------------------------------------------------------------
+    UINT_32 dataMacrotileBits        = pipeInterleaveBits + pipeBits + bankBits;
+    UINT_32 metadataMacrotileBits    = pipeInterleaveBits + pipeBits + bankBits;
+    // Build the masks from a 64-bit one: a plain 1L is only 32 bits wide on ILP32/LLP64 targets
+    UINT_64 dataMacrotileClearMask     = ~((static_cast<UINT_64>(1) << dataMacrotileBits) - 1);
+    UINT_64 metadataMacrotileClearMask = ~((static_cast<UINT_64>(1) << metadataMacrotileBits) - 1);
+
+    UINT_64 dataBaseByteAddressNoSwizzle = dataBaseByteAddress & dataMacrotileClearMask;
+    UINT_64 metadataBaseByteAddressNoSwizzle = metadataBaseByteAddress & metadataMacrotileClearMask;
+
+    ///--------------------------------------------------------------------------------------------
+    /// Modify metadata base before adding in so that when final address is divided by data ratio,
+    /// the base address returns to where it should be
+    ///--------------------------------------------------------------------------------------------
+    ADDR_ASSERT((0 != metadataBitSize));
+    UINT_64 metadataBaseShifted = metadataBaseByteAddressNoSwizzle * blockByteSize * 8 /
+                                  metadataBitSize;
+    UINT_64 offset = uncompressedDataByteAddress -
+                     dataBaseByteAddressNoSwizzle +
+                     metadataBaseShifted;
+
+    ///--------------------------------------------------------------------------------------------
+    /// Save bank data bits
+    ///--------------------------------------------------------------------------------------------
+    UINT_32 lsb = pipeBits + pipeInterleaveBits;
+    UINT_32 msb = bankBits - 1 + lsb;
+
+    UINT_64 bankDataBits = AddrGetBits(offset, msb, lsb);
+
+    ///--------------------------------------------------------------------------------------------
+    /// Save pipe data bits
+    ///--------------------------------------------------------------------------------------------
+    lsb = pipeInterleaveBits;
+    msb = pipeBits - 1 + lsb;
+
+    UINT_64 pipeDataBits = AddrGetBits(offset, msb, lsb);
+
+    ///--------------------------------------------------------------------------------------------
+    /// Remove pipe and bank bits
+    ///--------------------------------------------------------------------------------------------
+    lsb = pipeInterleaveBits;
+    msb = dataMacrotileBits - 1;
+
+    UINT_64 offsetWithoutPipeBankBits = AddrRemoveBits(offset, msb, lsb);
+
+    ADDR_ASSERT((0 != blockByteSize));
+    UINT_64 blockInBankpipe = offsetWithoutPipeBankBits / blockByteSize;
+
+    UINT_32 tileSize = 8 * 8 * elementBitSize/8 * numOfSamplesPerSplit;
+    UINT_32 blocksInTile = tileSize / blockByteSize;
+
+    if (0 == blocksInTile)
+    {
+        lsb = 0;
+    }
+    else
+    {
+        lsb = Log2(blocksInTile);
+    }
+    msb = bankBits - 1 + lsb;
+
+    UINT_64 blockInBankpipeWithBankBits = AddrInsertBits(blockInBankpipe, bankDataBits, msb, lsb);
+
+    /// NOTE *2 because we are converting to Nibble address in this step
+    UINT_64 metaAddressInPipe = blockInBankpipeWithBankBits * 2 * metadataBitSize / 8;
+
+
+    ///--------------------------------------------------------------------------------------------
+    /// Reinsert pipe bits back into the final address
+    ///--------------------------------------------------------------------------------------------
+    lsb = pipeInterleaveBits + 1; ///<+1 due to Nibble address now gives interleave bits extra lsb.
+    msb = pipeBits - 1 + lsb;
+    UINT_64 metadataAddress = AddrInsertBits(metaAddressInPipe, pipeDataBits, msb, lsb);
+
+    return metadataAddress;
+}
+
+/**
+***************************************************************************************************
+*   CIAddrLib::HwlPadDimensions
+*
+*   @brief
+*       Helper function to pad dimensions
+*
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID CIAddrLib::HwlPadDimensions(
+    AddrTileMode        tileMode,    ///< [in] tile mode
+    UINT_32             bpp,         ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,       ///< [in] surface flags
+    UINT_32             numSamples,  ///< [in] number of samples
+    ADDR_TILEINFO*      pTileInfo,   ///< [in/out] bank structure.
+    UINT_32             padDims,     ///< [in] Dimensions to pad valid value 1,2,3 (unused here)
+    UINT_32             mipLevel,    ///< [in] MipLevel
+    UINT_32*            pPitch,      ///< [in/out] pitch in pixels
+    UINT_32             pitchAlign,  ///< [in] pitch alignment
+    UINT_32*            pHeight,     ///< [in/out] height in pixels (only read here)
+    UINT_32             heightAlign, ///< [in] height alignment
+    UINT_32*            pSlices,     ///< [in/out] number of slices (unused here)
+    UINT_32             sliceAlign   ///< [in] number of slice alignment (unused here)
+    ) const
+{
+    if (m_settings.isVolcanicIslands &&
+        flags.dccCompatible &&
+        (numSamples > 1) &&
+        (mipLevel == 0) &&
+        IsMacroTiled(tileMode))
+    {
+        UINT_32 tileSizePerSample = BITS_TO_BYTES(bpp * MicroTileWidth * MicroTileHeight);
+        UINT_32 samplesPerSplit  = pTileInfo->tileSplitBytes / tileSizePerSample;
+        // Only pad when one tile split holds fewer samples than the surface has
+        if (samplesPerSplit < numSamples)
+        {
+            UINT_32 dccFastClearByteAlign = HwlGetPipes(pTileInfo) * m_pipeInterleaveBytes * 256;
+            UINT_32 bytesPerSplit = BITS_TO_BYTES((*pPitch) * (*pHeight) * bpp * samplesPerSplit);
+            // The alignment is applied as a bit mask below, hence the power-of-two assert
+            ADDR_ASSERT(IsPow2(dccFastClearByteAlign));
+
+            if (0 != (bytesPerSplit & (dccFastClearByteAlign - 1)))
+            {
+                UINT_32 dccFastClearPixelAlign = dccFastClearByteAlign /
+                                                BITS_TO_BYTES(bpp) /
+                                                samplesPerSplit;
+                UINT_32 macroTilePixelAlign = pitchAlign * heightAlign;
+
+                if ((dccFastClearPixelAlign >= macroTilePixelAlign) &&
+                    ((dccFastClearPixelAlign % macroTilePixelAlign) == 0))
+                {
+                    UINT_32 dccFastClearPitchAlignInMacroTile =
+                        dccFastClearPixelAlign / macroTilePixelAlign;
+                    UINT_32 heightInMacroTile = *pHeight / heightAlign;
+                    UINT_32 dccFastClearPitchAlignInPixels;
+                    // Halve both values for as long as each is even and greater than one
+                    while ((heightInMacroTile > 1) &&
+                           ((heightInMacroTile % 2) == 0) &&
+                           (dccFastClearPitchAlignInMacroTile > 1) &&
+                           ((dccFastClearPitchAlignInMacroTile % 2) == 0))
+                    {
+                        heightInMacroTile >>= 1;
+                        dccFastClearPitchAlignInMacroTile >>= 1;
+                    }
+
+                    dccFastClearPitchAlignInPixels = pitchAlign * dccFastClearPitchAlignInMacroTile;
+                    // Align *pPitch; non-power-of-two values are rounded up by add/divide/multiply
+                    if (IsPow2(dccFastClearPitchAlignInPixels))
+                    {
+                        *pPitch = PowTwoAlign((*pPitch), dccFastClearPitchAlignInPixels);
+                    }
+                    else
+                    {
+                        *pPitch += (dccFastClearPitchAlignInPixels - 1);
+                        *pPitch /= dccFastClearPitchAlignInPixels;
+                        *pPitch *= dccFastClearPitchAlignInPixels;
+                    }
+                }
+            }
+        }
+    }
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
new file mode 100644
index 00000000000..451508619f9
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  ciaddrlib.h
+* @brief Contains the CIAddrLib class definition.
+***************************************************************************************************
+*/
+
+#ifndef __CI_ADDR_LIB_H__
+#define __CI_ADDR_LIB_H__
+
+#include "addrlib.h"
+#include "siaddrlib.h"
+
+/**
+***************************************************************************************************
+* @brief CI specific settings structure: one-bit chip identification flags (CI and VI parts).
+***************************************************************************************************
+*/
+struct CIChipSettings
+{
+    struct // unnamed struct wrapping all of the one-bit flags
+    {
+        UINT_32 isSeaIsland : 1; // first flag of the CI group (the VI group follows below)
+        UINT_32 isBonaire   : 1;
+        UINT_32 isKaveri    : 1;
+        UINT_32 isSpectre   : 1;
+        UINT_32 isSpooky    : 1;
+        UINT_32 isKalindi   : 1;
+        // Hawaii is GFXIP 7.2, similar to CI (Bonaire)
+        UINT_32 isHawaii    : 1;
+
+        // VI
+        UINT_32 isVolcanicIslands : 1;
+        UINT_32 isIceland         : 1;
+        UINT_32 isTonga           : 1;
+        UINT_32 isFiji            : 1;
+        // VI fusion (Carrizo)
+        UINT_32 isCarrizo         : 1;
+    };
+};
+
+/**
+***************************************************************************************************
+* @brief CI specific address library: derives from SIAddrLib and declares the Hwl*
+*        virtual functions listed below.
+***************************************************************************************************
+*/
+class CIAddrLib : public SIAddrLib
+{
+public:
+    /// Creates a CIAddrLib object using the operator new form that takes the client
+    static AddrLib* CreateObj(const AddrClient* pClient)
+    {
+        return new(pClient) CIAddrLib(pClient);
+    }
+
+private:
+    CIAddrLib(const AddrClient* pClient); // private: CreateObj() is the in-class creation path
+    virtual ~CIAddrLib();
+
+protected:
+
+    // Hwl interface - defined in AddrLib
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+    virtual AddrChipFamily HwlConvertChipFamily(
+        UINT_32 uChipFamily, UINT_32 uChipRevision);
+
+    virtual BOOL_32 HwlInitGlobalParams(
+        const ADDR_CREATE_INPUT* pCreateIn);
+
+    virtual ADDR_E_RETURNCODE HwlSetupTileCfg(
+        INT_32 index, INT_32 macroModeIndex, ADDR_TILEINFO* pInfo,
+        AddrTileMode* pMode = 0, AddrTileType* pType = 0) const;
+
+    virtual VOID HwlComputeTileDataWidthAndHeightLinear(
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight,
+        UINT_32 bpp, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual INT_32 HwlComputeMacroModeIndex(
+        INT_32 tileIndex, ADDR_SURFACE_FLAGS flags, UINT_32 bpp, UINT_32 numSamples,
+        ADDR_TILEINFO* pTileInfo, AddrTileMode* pTileMode = NULL, AddrTileType* pTileType = NULL
+        ) const;
+
+    // Sub-hwl interface - defined in EgBasedAddrLib
+    virtual VOID HwlSetupTileInfo(
+        AddrTileMode tileMode, ADDR_SURFACE_FLAGS flags,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        ADDR_TILEINFO* inputTileInfo, ADDR_TILEINFO* outputTileInfo,
+        AddrTileType inTileType, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual INT_32 HwlPostCheckTileIndex(
+        const ADDR_TILEINFO* pInfo, AddrTileMode mode, AddrTileType type,
+        INT curIndex = TileIndexInvalid) const;
+
+    virtual VOID   HwlFmaskPreThunkSurfInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pFmaskIn,
+        const ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut,
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pSurfIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut) const;
+
+    virtual VOID   HwlFmaskPostThunkSurfInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut) const;
+
+    virtual AddrTileMode HwlDegradeThickTileMode(
+        AddrTileMode baseTileMode, UINT_32 numSlices, UINT_32* pBytesPerTile) const;
+
+    virtual BOOL_32 HwlOverrideTileMode(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        AddrTileMode* pTileMode,
+        AddrTileType* pTileType) const;
+
+    virtual BOOL_32 HwlStereoCheckRightOffsetPadding() const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeDccInfo(
+        const ADDR_COMPUTE_DCCINFO_INPUT* pIn,
+        ADDR_COMPUTE_DCCINFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeCmaskAddrFromCoord(
+        const ADDR_COMPUTE_CMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_CMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+protected: // access level is unchanged by this label; it only sets HwlPadDimensions apart
+    virtual VOID HwlPadDimensions(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 numSamples, ADDR_TILEINFO* pTileInfo, UINT_32 padDims, UINT_32 mipLevel,
+        UINT_32* pPitch, UINT_32 pitchAlign, UINT_32* pHeight, UINT_32 heightAlign,
+        UINT_32* pSlices, UINT_32 sliceAlign) const;
+
+private:
+    VOID ReadGbTileMode(
+        UINT_32 regValue, ADDR_TILECONFIG* pCfg) const;
+
+    VOID ReadGbMacroTileCfg(
+        UINT_32 regValue, ADDR_TILEINFO* pCfg) const;
+
+    BOOL_32 InitTileSettingTable(
+        const UINT_32 *pSetting, UINT_32 noOfEntries);
+
+    BOOL_32 InitMacroTileCfgTable(
+        const UINT_32 *pSetting, UINT_32 noOfEntries);
+
+    UINT_64 HwlComputeMetadataNibbleAddress( // private and not declared virtual in this class
+        UINT_64 uncompressedDataByteAddress,
+        UINT_64 dataBaseByteAddress,
+        UINT_64 metadataBaseByteAddress,
+        UINT_32 metadataBitSize,
+        UINT_32 elementBitSize,
+        UINT_32 blockByteSize,
+        UINT_32 pipeInterleaveBytes,
+        UINT_32 numOfPipes,
+        UINT_32 numOfBanks,
+        UINT_32 numOfSamplesPerSplit) const;
+
+    static const UINT_32    MacroTileTableSize = 16; ///< Element count of m_macroTileTable
+    ADDR_TILEINFO           m_macroTileTable[MacroTileTableSize];
+    UINT_32                 m_noOfMacroEntries; ///< presumably the used part of the table - TODO confirm
+    BOOL_32                 m_allowNonDispThickModes;
+
+    CIChipSettings          m_settings; ///< Chip identification flags
+};
+
+#endif
+
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.cpp
new file mode 100644
index 00000000000..b1e008b8392
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.cpp
@@ -0,0 +1,4575 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  egbaddrlib.cpp
+* @brief Contains the EgBasedAddrLib class implementation
+***************************************************************************************************
+*/
+
+#include "egbaddrlib.h"
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::EgBasedAddrLib
+*
+*   @brief
+*       Constructor; forwards pClient to the AddrLib base class
+*
+*   @note
+*       m_ranks and m_logicalBanks start at 0, m_bankInterleave starts at 1
+***************************************************************************************************
+*/
+EgBasedAddrLib::EgBasedAddrLib(const AddrClient* pClient) :
+    AddrLib(pClient),
+    m_ranks(0),
+    m_logicalBanks(0),
+    m_bankInterleave(1)
+{
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::~EgBasedAddrLib
+*
+*   @brief
+*       Destructor (empty body; nothing is released at this level)
+***************************************************************************************************
+*/
+EgBasedAddrLib::~EgBasedAddrLib()
+{
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeSurfaceInfo
+*
+*   @brief
+*       Entry point for surface size computation: prepares the tile info defaults and then
+*       routes the request to the linear, micro-tiled or macro-tiled calculator. Padded
+*       pitch/height/slices, total size in bytes, alignments and the tile mode finally
+*       chosen are returned through pOut.
+*
+*   @note
+*       Steps, in order:
+*         1. Degrade the requested tile mode via DegradeLargeThickTile.
+*         2. On ADDR_CHIP_FAMILY_NI and above, replace the sample count by the fragment
+*            count (numFrags == 0 means "use numSamples") and store it in pOut->numSamples.
+*         3. Seed the tile info that pOut->pTileInfo points to: a copy of *pIn->pTileInfo
+*            when the caller supplied one, all zeroes otherwise.
+*         4. Let HwlSetupTileInfo fill in the tiling parameters.
+*         5. Use padDims == 2 for mip level 0 of a cube surface, 0 in every other case.
+*         6. Call ComputeSurfaceInfoLinear / MicroTiled / MacroTiled for the tile mode.
+*
+*   @note
+*       pOut->pTileInfo is expected to be non-NULL (asserted).
+*
+*   @return
+*       Value returned by the selected calculator; FALSE for an unrecognized tile mode
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::DispatchComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_SURFACE_FLAGS  surfFlags    = pIn->flags;
+    const UINT_32       bitsPerPixel = pIn->bpp;
+    const UINT_32       width        = pIn->width;
+    const UINT_32       rows         = pIn->height;
+    const UINT_32       sliceCount   = pIn->numSlices;
+    const UINT_32       level        = pIn->mipLevel;
+    UINT_32             sampleCount  = pIn->numSamples;
+    UINT_32             fragCount    = pIn->numFrags;
+
+    // A fragment count of 0 stands for "same as the sample count"
+    if (fragCount == 0)
+    {
+        fragCount = sampleCount;
+    }
+
+    const AddrTileMode  mode = DegradeLargeThickTile(pIn->tileMode, bitsPerPixel);
+
+    // Only override numSamples for NI above
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        // With EQAA (numFrags != numSamples) the real surface size needed is determined by
+        // the number of fragments; when both already match this assignment changes nothing.
+        sampleCount = fragCount;
+
+        // Save altered numSamples in pOut
+        pOut->numSamples = sampleCount;
+    }
+
+    // Caller makes sure pOut->pTileInfo is not NULL, see HwlComputeSurfaceInfo
+    ADDR_ASSERT(pOut->pTileInfo);
+
+    // Work on a zeroed local instead should pOut->pTileInfo be NULL after all
+    ADDR_TILEINFO       localTileInfo = {0};
+    ADDR_TILEINFO*      pTileInfo     =
+        (pOut->pTileInfo != NULL) ? pOut->pTileInfo : &localTileInfo;
+
+    // Set default values
+    if (pIn->pTileInfo == NULL)
+    {
+        memset(pTileInfo, 0, sizeof(ADDR_TILEINFO));
+    }
+    else if (pTileInfo != pIn->pTileInfo)
+    {
+        *pTileInfo = *pIn->pTileInfo;
+    }
+
+    // For macro tile mode, we should calculate default tiling parameters
+    HwlSetupTileInfo(mode, surfFlags, bitsPerPixel, width, rows, sampleCount,
+                     pIn->pTileInfo, pTileInfo, pIn->tileType, pOut);
+
+    // Only the base level of a cube map asks for padDims == 2
+    const UINT_32 padDims = (surfFlags.cube && (level == 0)) ? 2 : 0;
+
+    if (surfFlags.cube && (sliceCount == 1))
+    {
+        // This is calculating one face, remove cube flag.
+        // NOTE(review): surfFlags is a local copy and is not read again in this function.
+        surfFlags.cube = 0;
+    }
+
+    // Hand over to the calculator that matches the tile mode
+    BOOL_32 valid = FALSE;
+
+    switch (mode)
+    {
+        case ADDR_TM_LINEAR_GENERAL: // fall through
+        case ADDR_TM_LINEAR_ALIGNED:
+            valid = ComputeSurfaceInfoLinear(pIn, pOut, padDims);
+            break;
+
+        case ADDR_TM_1D_TILED_THIN1: // fall through
+        case ADDR_TM_1D_TILED_THICK:
+            valid = ComputeSurfaceInfoMicroTiled(pIn, pOut, padDims, mode);
+            break;
+
+        case ADDR_TM_2D_TILED_THIN1:     // fall through
+        case ADDR_TM_2D_TILED_THICK:     // fall through
+        case ADDR_TM_3D_TILED_THIN1:     // fall through
+        case ADDR_TM_3D_TILED_THICK:     // fall through
+        case ADDR_TM_2D_TILED_XTHICK:    // fall through
+        case ADDR_TM_3D_TILED_XTHICK:    // fall through
+        case ADDR_TM_PRT_TILED_THIN1:    // fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1: // fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1: // fall through
+        case ADDR_TM_PRT_TILED_THICK:    // fall through
+        case ADDR_TM_PRT_2D_TILED_THICK: // fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            valid = ComputeSurfaceInfoMacroTiled(pIn, pOut, padDims, mode);
+            break;
+
+        default:
+            // Unrecognized tile mode: valid keeps its FALSE initializer
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceInfoLinear
+*
+*   @brief
+*       Linear surface calculator: determines the alignments, pads pitch, height and slices
+*       and writes the padded dimensions plus the total size to pOut. The tile mode is
+*       linear already, so the one from pIn is handed back unchanged.
+*
+*   @note
+*       Sequence: ComputeSurfaceAlignmentsLinear, HwlPreHandleBaseLvl3xPitch, PadDimensions,
+*       HwlPostHandleBaseLvl3xPitch, HwlGetSizeAdjustmentLinear. The value returned by the
+*       last call is multiplied by the padded slice count to give pOut->surfSize.
+*
+*   @note
+*       The sample count is read from pOut->numSamples, not from pIn. Unless ALT_TEST is
+*       non-zero, a multi-row ADDR_TM_LINEAR_GENERAL color surface is asserted to have a
+*       width that is a multiple of 8.
+*
+*   @return
+*       TRUE (always)
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceInfoLinear(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut,   ///< [out] Output structure
+    UINT_32                                 padDims ///< [in] Dimensions to pad
+    ) const
+{
+    // No linear MSAA on real H/W, keep this for TGL
+    const UINT_32 sampleCount = pOut->numSamples;
+
+    // Used both as depth alignment and as slice padding granularity
+    const UINT_32 linearThickness = 1;
+
+    UINT_32 pitch  = pIn->width;
+    UINT_32 rows   = pIn->height;
+    UINT_32 slices = pIn->numSlices;
+
+    // Compute the surface alignments.
+    ComputeSurfaceAlignmentsLinear(pIn->tileMode,
+                                   pIn->bpp,
+                                   pIn->flags,
+                                   &pOut->baseAlign,
+                                   &pOut->pitchAlign,
+                                   &pOut->heightAlign);
+
+#if !ALT_TEST
+    // When linear_general surface is accessed in multiple lines, it requires 8 pixels in pitch
+    // alignment since PITCH_TILE_MAX is in unit of 8 pixels.
+    // It is OK if it is accessed per line.
+    if ((pIn->tileMode == ADDR_TM_LINEAR_GENERAL) && pIn->flags.color && (pIn->height > 1))
+    {
+        ADDR_ASSERT((pIn->width % 8) == 0);
+    }
+#endif
+
+    pOut->depthAlign = linearThickness;
+
+    pitch = HwlPreHandleBaseLvl3xPitch(pIn, pitch);
+
+    // Pad pitch and height to the required granularities.
+    PadDimensions(pIn->tileMode,
+                  pIn->bpp,
+                  pIn->flags,
+                  sampleCount,
+                  pOut->pTileInfo,
+                  padDims,
+                  pIn->mipLevel,
+                  &pitch, pOut->pitchAlign,
+                  &rows, pOut->heightAlign,
+                  &slices, linearThickness);
+
+    pitch = HwlPostHandleBaseLvl3xPitch(pIn, pitch);
+
+    // Adjust per HWL
+    const UINT_64 sliceSize = HwlGetSizeAdjustmentLinear(pIn->tileMode,
+                                                         pIn->bpp,
+                                                         sampleCount,
+                                                         pOut->baseAlign,
+                                                         pOut->pitchAlign,
+                                                         &pitch,
+                                                         &rows,
+                                                         &pOut->heightAlign);
+
+    // Report the results; linear mode hands the input tile mode back unchanged
+    pOut->tileMode = pIn->tileMode;
+    pOut->pitch    = pitch;
+    pOut->height   = rows;
+    pOut->depth    = slices;
+    pOut->surfSize = sliceSize * slices;
+
+    return TRUE;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceInfoMicroTiled
+*
+*   @brief
+*       1D/micro tiled surface calculator: determines the alignments, pads pitch, height and
+*       slices and writes the padded dimensions, the total size and the tile mode actually
+*       used to pOut.
+*
+*   @note
+*       For mip levels above 0, an ADDR_TM_1D_TILED_THICK request with fewer slices than
+*       ThickTileThickness goes through HwlDegradeThickTileMode; if that yields another mode,
+*       a thickness of 1 is used from then on.
+*
+*   @note
+*       The sample count is read from pOut->numSamples, not from pIn.
+*
+*   @return
+*       TRUE (always)
+*
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceInfoMicroTiled(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut,       ///< [out] Output structure
+    UINT_32                                 padDims,    ///< [in] Dimensions to pad
+    AddrTileMode                            expTileMode ///< [in] Expected tile mode
+    ) const
+{
+    // No 1D MSAA on real H/W, keep this for TGL
+    const UINT_32 sampleCount = pOut->numSamples;
+
+    UINT_32 pitch  = pIn->width;
+    UINT_32 rows   = pIn->height;
+    UINT_32 slices = pIn->numSlices;
+
+    //
+    // Compute the micro tile thickness.
+    //
+    UINT_32 thickness = ComputeSurfaceThickness(expTileMode);
+
+    //
+    // Extra override for mip levels: reduce tiling mode from thick to thin if the number of
+    // slices is less than the micro tile thickness.
+    //
+    if ((pIn->mipLevel > 0) &&
+        (expTileMode == ADDR_TM_1D_TILED_THICK) &&
+        (slices < ThickTileThickness))
+    {
+        expTileMode = HwlDegradeThickTileMode(ADDR_TM_1D_TILED_THICK, slices, NULL);
+
+        if (expTileMode != ADDR_TM_1D_TILED_THICK)
+        {
+            thickness = 1;
+        }
+    }
+
+    //
+    // Compute the surface restrictions.
+    //
+    ComputeSurfaceAlignmentsMicroTiled(expTileMode,
+                                       pIn->bpp,
+                                       pIn->flags,
+                                       sampleCount,
+                                       &pOut->baseAlign,
+                                       &pOut->pitchAlign,
+                                       &pOut->heightAlign);
+
+    pOut->depthAlign = thickness;
+
+    //
+    // Pad pitch and height to the required granularities.
+    //
+    PadDimensions(expTileMode,
+                  pIn->bpp,
+                  pIn->flags,
+                  sampleCount,
+                  pOut->pTileInfo,
+                  padDims,
+                  pIn->mipLevel,
+                  &pitch, pOut->pitchAlign,
+                  &rows, pOut->heightAlign,
+                  &slices, thickness);
+
+    //
+    // Get HWL specific pitch adjustment
+    //
+    const UINT_64 sliceSize = HwlGetSizeAdjustmentMicroTiled(thickness,
+                                                             pIn->bpp,
+                                                             pIn->flags,
+                                                             sampleCount,
+                                                             pOut->baseAlign,
+                                                             pOut->pitchAlign,
+                                                             &pitch,
+                                                             &rows);
+
+    //
+    // Return parameters.
+    //
+    pOut->tileMode = expTileMode;
+    pOut->pitch    = pitch;
+    pOut->height   = rows;
+    pOut->depth    = slices;
+    pOut->surfSize = sliceSize * slices;
+
+    return TRUE;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceInfoMacroTiled
+*
+*   @brief
+*       Compute 2D/macro tiled surface sizes: padded pitch, height and slices plus the total
+*       size in bytes. For mip levels above 0 the tile mode may change, in which case the
+*       work is handed to the micro-tiled calculator or redone here with the new mode.
+*
+*   @return
+*       Result of the last ComputeSurfaceAlignmentsMacroTiled call, or of the nested calculator
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceInfoMacroTiled(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut,       ///< [out] Output structure
+    UINT_32                                 padDims,    ///< [in] Dimensions to pad
+    AddrTileMode                            expTileMode ///< [in] Expected tile mode
+    ) const
+{
+    BOOL_32 valid = TRUE;
+
+    AddrTileMode origTileMode = expTileMode; // mode on entry, compared after the mip override
+    UINT_32 microTileThickness;
+
+    UINT_32 paddedPitch;
+    UINT_32 paddedHeight;
+    UINT_64 bytesPerSlice;
+
+    UINT_32 expPitch     = pIn->width;
+    UINT_32 expHeight    = pIn->height;
+    UINT_32 expNumSlices = pIn->numSlices;
+
+    UINT_32 numSamples = pOut->numSamples; // read from pOut, not from pIn
+
+    //
+    // Compute the surface restrictions as base
+    // SanityCheckMacroTiled is called in ComputeSurfaceAlignmentsMacroTiled
+    //
+    valid = ComputeSurfaceAlignmentsMacroTiled(expTileMode,
+                                               pIn->bpp,
+                                               pIn->flags,
+                                               pIn->mipLevel,
+                                               numSamples,
+                                               pOut->pTileInfo,
+                                               &pOut->baseAlign,
+                                               &pOut->pitchAlign,
+                                               &pOut->heightAlign);
+
+    if (valid)
+    {
+        //
+        // Compute the micro tile thickness.
+        //
+        microTileThickness = ComputeSurfaceThickness(expTileMode);
+
+        //
+        // Find the correct tiling mode for mip levels
+        //
+        if (pIn->mipLevel > 0)
+        {
+            //
+            // Try valid tile mode
+            //
+            expTileMode = ComputeSurfaceMipLevelTileMode(expTileMode,
+                                                         pIn->bpp,
+                                                         expPitch,
+                                                         expHeight,
+                                                         expNumSlices,
+                                                         numSamples,
+                                                         pOut->pitchAlign,
+                                                         pOut->heightAlign,
+                                                         pOut->pTileInfo);
+
+            if (!IsMacroTiled(expTileMode)) // Downgraded to micro-tiled
+            {
+                return ComputeSurfaceInfoMicroTiled(pIn, pOut, padDims, expTileMode);
+            }
+            else
+            {
+                if (microTileThickness != ComputeSurfaceThickness(expTileMode))
+                {
+                    // Recursive call, this time starting from the new tile mode:
+                    // Re-compute if thickness changed since bank-height may be changed!
+                    //
+                    return ComputeSurfaceInfoMacroTiled(pIn, pOut, padDims, expTileMode);
+                }
+            }
+        }
+
+        paddedPitch     = expPitch;
+        paddedHeight    = expHeight;
+
+        // Re-calculate alignment for the new mode (same thickness, so no recursion above).
+        // NOTE(review): a FALSE result here is returned at the end, yet pOut is still
+        // filled in by the code below.
+        if (expTileMode != origTileMode) // Tile mode is changed but still macro-tiled
+        {
+            valid = ComputeSurfaceAlignmentsMacroTiled(expTileMode,
+                                                       pIn->bpp,
+                                                       pIn->flags,
+                                                       pIn->mipLevel,
+                                                       numSamples,
+                                                       pOut->pTileInfo,
+                                                       &pOut->baseAlign,
+                                                       &pOut->pitchAlign,
+                                                       &pOut->heightAlign);
+        }
+
+        //
+        // Do padding
+        //
+        PadDimensions(expTileMode,
+                      pIn->bpp,
+                      pIn->flags,
+                      numSamples,
+                      pOut->pTileInfo,
+                      padDims,
+                      pIn->mipLevel,
+                      &paddedPitch, pOut->pitchAlign,
+                      &paddedHeight, pOut->heightAlign,
+                      &expNumSlices, microTileThickness);
+
+        if (pIn->flags.qbStereo &&
+            (pOut->pStereoInfo != NULL) &&
+            HwlStereoCheckRightOffsetPadding())
+        {
+            // Eye height's bank bits are different from y == 0?
+            // Since 3D rendering treats right eye buffer starting from y == "eye height" while
+            // display engine treats it to be 0, so the bank bits may be different, we pad
+            // more in height to make sure y == "eye height" has the same bank bits as y == 0.
+            UINT_32 checkMask = pOut->pTileInfo->banks - 1; // presumably banks is a power of two - TODO confirm
+            UINT_32 bankBits = 0;
+            do // grow paddedHeight by heightAlign until the masked bank bits are zero
+            {
+                bankBits = (paddedHeight / 8 / pOut->pTileInfo->bankHeight) & checkMask;
+
+                if (bankBits)
+                {
+                   paddedHeight += pOut->heightAlign;
+                }
+            } while (bankBits);
+        }
+
+        //
+        // Compute the size of a slice (64-bit arithmetic; bpp is rounded up by NextPow2).
+        //
+        bytesPerSlice = BITS_TO_BYTES(static_cast<UINT_64>(paddedPitch) *
+                                      paddedHeight * NextPow2(pIn->bpp) * numSamples);
+
+        pOut->pitch = paddedPitch;
+        // Put this check right here to work around special mipmap cases in which the original
+        // height is needed.
+        // The original height is pre-stored in pOut->height in PostComputeMipLevel and
+        // pOut->pitch is needed in HwlCheckLastMacroTiledLvl, too.
+        if (m_configFlags.checkLast2DLevel && numSamples == 1) // Don't check MSAA
+        {
+            // Set a TRUE in pOut if next Level is the first 1D sub level
+            HwlCheckLastMacroTiledLvl(pIn, pOut);
+        }
+        pOut->height = paddedHeight; // deliberately written only after the check above
+
+        pOut->depth = expNumSlices;
+
+        pOut->surfSize = bytesPerSlice * expNumSlices;
+
+        pOut->tileMode = expTileMode;
+
+        pOut->depthAlign = microTileThickness;
+
+    } // if (valid)
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAlignmentsLinear
+*
+*   @brief
+*       Work out base, pitch and height alignment for the two linear tile modes and hand
+*       them back through the output pointers.
+*
+*   @return
+*       TRUE (always; an unexpected tile mode only triggers ADDR_UNHANDLED_CASE)
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceAlignmentsLinear(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32*            pBaseAlign,        ///< [out] base address alignment in bytes
+    UINT_32*            pPitchAlign,       ///< [out] pitch alignment in pixels
+    UINT_32*            pHeightAlign       ///< [out] height alignment in pixels
+    ) const
+{
+    if (tileMode == ADDR_TM_LINEAR_GENERAL)
+    {
+        //
+        // The required base alignment and pitch and height granularities are 1 element.
+        //
+        *pBaseAlign   = (bpp > 8) ? bpp / 8 : 1;
+        *pPitchAlign  = 1;
+        *pHeightAlign = 1;
+    }
+    else if (tileMode == ADDR_TM_LINEAR_ALIGNED)
+    {
+        //
+        // The required alignment for base is the pipe interleave size.
+        // The required granularity for pitch is hwl dependent.
+        // The required granularity for height is one row.
+        //
+        *pBaseAlign   = m_pipeInterleaveBytes;
+        *pPitchAlign  = HwlGetPitchAlignmentLinear(bpp, flags);
+        *pHeightAlign = 1;
+    }
+    else
+    {
+        // Neither of the two modes above: alignment of 1 everywhere, then flag the case
+        *pBaseAlign   = 1;
+        *pPitchAlign  = 1;
+        *pHeightAlign = 1;
+        ADDR_UNHANDLED_CASE();
+    }
+
+    // AdjustPitchAlignment gets the flags and may modify *pPitchAlign
+    AdjustPitchAlignment(flags, pPitchAlign);
+
+    return TRUE;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAlignmentsMicroTiled
+*
+*   @brief
+*       Compute 1D tiled surface alignment (base in bytes, pitch/height in pixels); results are
+*       returned through the output parameters. czDispCompatible surfaces get 4096-byte base
+*       alignment and a pitch aligned so that 8 rows span a multiple of 4096 bytes.
+*   @return
+*       TRUE (always)
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceAlignmentsMicroTiled(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32             numSamples,        ///< [in] number of samples
+    UINT_32*            pBaseAlign,        ///< [out] base address alignment in bytes
+    UINT_32*            pPitchAlign,       ///< [out] pitch alignment in pixels
+    UINT_32*            pHeightAlign       ///< [out] height alignment in pixels
+    ) const
+{
+    // The required alignment for base is the pipe interleave size.
+    *pBaseAlign   = m_pipeInterleaveBytes;
+    *pPitchAlign  = HwlGetPitchAlignmentMicroTiled(tileMode, bpp, flags, numSamples);
+    *pHeightAlign = MicroTileHeight;
+
+    AdjustPitchAlignment(flags, pPitchAlign);
+
+    // ECR#393489
+    // Workaround 2 for 1D tiling - there is a HW bug for Carrizo
+    // where it requires the following alignments for 1D tiling.
+    if (flags.czDispCompatible)
+    {
+        // At least one byte per pixel; this also keeps the division below defined for bpp == 0.
+        const UINT_32 bytesPerPixel = (bpp > 8) ? BITS_TO_BYTES(bpp) : 1;
+        // Base address MOD 4096 = 0
+        *pBaseAlign  = PowTwoAlign(*pBaseAlign, 4096);
+        // (8 lines * pitch * bytes per pixel) MOD 4096 = 0, so the pitch must be a multiple of
+        // 512 / bytesPerPixel pixels (a divide; shifting 512 right by the byte count is wrong).
+        // NOTE(review): a non-power-of-two bytesPerPixel is not special-cased - confirm.
+        *pPitchAlign = PowTwoAlign(*pPitchAlign, 512 / bytesPerPixel);
+    }
+    // end Carrizo workaround for 1D tiling
+
+    return TRUE;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlReduceBankWidthHeight
+*
+*   @brief
+*       Additional checks, reduce bankHeight/bankWidth if needed and possible so that
+*       tileSize*BANK_WIDTH*BANK_HEIGHT <= ROW_SIZE; bankWidth is halved first, then bankHeight
+*
+*   @return
+*       TRUE if the constraint holds, or reduction stops early for depth with bpp >= 64; else FALSE
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::HwlReduceBankWidthHeight(
+    UINT_32             tileSize,           ///< [in] tile size
+    UINT_32             bpp,                ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,              ///< [in] surface flags
+    UINT_32             numSamples,         ///< [in] number of samples
+    UINT_32             bankHeightAlign,    ///< [in] bank height alignment (recomputed if bankWidth shrinks)
+    UINT_32             pipes,              ///< [in] pipes
+    ADDR_TILEINFO*      pTileInfo           ///< [in/out] bank structure.
+    ) const
+{
+    UINT_32 macroAspectAlign;
+    BOOL_32 valid = TRUE;
+
+    if (tileSize * pTileInfo->bankWidth * pTileInfo->bankHeight > m_rowSize)
+    {
+        BOOL_32 stillGreater = TRUE;
+
+        // Try reducing bankWidth first: halve it until the constraint holds or it bottoms out at 1
+        if (stillGreater && pTileInfo->bankWidth > 1)
+        {
+            while (stillGreater && pTileInfo->bankWidth > 0)
+            {
+                pTileInfo->bankWidth >>= 1;
+
+                if (pTileInfo->bankWidth == 0)
+                {
+                    pTileInfo->bankWidth = 1;
+                    break;
+                }
+
+                stillGreater =
+                    tileSize * pTileInfo->bankWidth * pTileInfo->bankHeight > m_rowSize;
+            }
+
+            // bankWidth is reduced above, so we need to recalculate the bankHeight alignment and ratio
+            bankHeightAlign = Max(1u,
+                                  m_pipeInterleaveBytes * m_bankInterleave /
+                                  (tileSize * pTileInfo->bankWidth)
+                                  );
+
+            // We cannot increase bankHeight so just assert this case.
+            ADDR_ASSERT((pTileInfo->bankHeight % bankHeightAlign) == 0);
+
+            if (numSamples == 1)
+            {
+                macroAspectAlign = Max(1u,
+                                   m_pipeInterleaveBytes * m_bankInterleave /
+                                   (tileSize * pipes * pTileInfo->bankWidth)
+                                   );
+                pTileInfo->macroAspectRatio = PowTwoAlign(pTileInfo->macroAspectRatio,
+                                                          macroAspectAlign);
+            }
+        }
+
+        // Early quit bank_height degradation for "64" bit z buffer (this also makes the result TRUE)
+        if (flags.depth && bpp >= 64)
+        {
+            stillGreater = FALSE;
+        }
+
+        // Then try reducing bankHeight, never going below bankHeightAlign
+        if (stillGreater && pTileInfo->bankHeight > bankHeightAlign)
+        {
+            while (stillGreater && pTileInfo->bankHeight > bankHeightAlign)
+            {
+                pTileInfo->bankHeight >>= 1;
+
+                if (pTileInfo->bankHeight < bankHeightAlign)
+                {
+                    pTileInfo->bankHeight = bankHeightAlign;
+                    break;
+                }
+
+                stillGreater =
+                    tileSize * pTileInfo->bankWidth * pTileInfo->bankHeight > m_rowSize;
+            }
+        }
+
+        valid = !stillGreater;
+
+        // Generate a warning if we still fail to meet this constraint
+        if (!valid)
+        {
+            ADDR_WARN(
+                0, ("TILE_SIZE(%d)*BANK_WIDTH(%d)*BANK_HEIGHT(%d) <= ROW_SIZE(%d)",
+                tileSize, pTileInfo->bankWidth, pTileInfo->bankHeight, m_rowSize));
+        }
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAlignmentsMacroTiled
+*
+*   @brief
+*       Compute 2D tiled surface alignment, calculation results are returned through
+*       output parameters. pTileInfo may be adjusted (bankWidth, bankHeight, macroAspectRatio).
+*       Outputs are written only when the sanity check of pTileInfo passes.
+*   @return
+*       TRUE if the tile info passes the sanity check and the bank width/height reduction succeeds
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::ComputeSurfaceAlignmentsMacroTiled(
+    AddrTileMode        tileMode,           ///< [in] tile mode
+    UINT_32             bpp,                ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,              ///< [in] surface flags
+    UINT_32             mipLevel,           ///< [in] mip level
+    UINT_32             numSamples,         ///< [in] number of samples
+    ADDR_TILEINFO*      pTileInfo,          ///< [in/out] bank structure.
+    UINT_32*            pBaseAlign,         ///< [out] base address alignment in bytes
+    UINT_32*            pPitchAlign,        ///< [out] pitch alignment in pixels
+    UINT_32*            pHeightAlign        ///< [out] height alignment in pixels
+    ) const
+{
+    BOOL_32 valid = SanityCheckMacroTiled(pTileInfo);
+
+    if (valid)
+    {
+        UINT_32 macroTileWidth;
+        UINT_32 macroTileHeight;
+
+        UINT_32 tileSize;
+        UINT_32 bankHeightAlign;
+        UINT_32 macroAspectAlign;
+
+        UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+        UINT_32 pipes = HwlGetPipes(pTileInfo);
+
+        //
+        // Align bank height first according to latest h/w spec
+        //
+
+        // tile_size = MIN(tile_split, 64 * tile_thickness * element_bytes * num_samples)
+        tileSize = Min(pTileInfo->tileSplitBytes,
+                       BITS_TO_BYTES(64 * thickness * bpp * numSamples));
+
+        // bank_height_align =
+        // MAX(1, (pipe_interleave_bytes * bank_interleave)/(tile_size*bank_width))
+        bankHeightAlign = Max(1u,
+                              m_pipeInterleaveBytes * m_bankInterleave /
+                              (tileSize * pTileInfo->bankWidth)
+                              );
+
+        pTileInfo->bankHeight = PowTwoAlign(pTileInfo->bankHeight, bankHeightAlign);
+
+        // num_pipes * bank_width * macro_tile_aspect >=
+        // (pipe_interleave_size * bank_interleave) / tile_size
+        if (numSamples == 1)
+        {
+            // this restriction is only for mipmap (mipmap's numSamples must be 1)
+            macroAspectAlign = Max(1u,
+                               m_pipeInterleaveBytes * m_bankInterleave /
+                               (tileSize * pipes * pTileInfo->bankWidth)
+                               );
+            pTileInfo->macroAspectRatio = PowTwoAlign(pTileInfo->macroAspectRatio, macroAspectAlign);
+        }
+
+        valid = HwlReduceBankWidthHeight(tileSize,
+                                      bpp,
+                                      flags,
+                                      numSamples,
+                                      bankHeightAlign,
+                                      pipes,
+                                      pTileInfo);
+        // NOTE(review): the alignments below are still computed and written when valid is FALSE here.
+        //
+        // The required granularity for pitch is the macro tile width.
+        //
+        macroTileWidth = MicroTileWidth * pTileInfo->bankWidth * pipes *
+            pTileInfo->macroAspectRatio;
+
+        *pPitchAlign = macroTileWidth;
+
+        AdjustPitchAlignment(flags, pPitchAlign);
+
+        //
+        // The required granularity for height is the macro tile height.
+        //
+        macroTileHeight = MicroTileHeight * pTileInfo->bankHeight * pTileInfo->banks /
+            pTileInfo->macroAspectRatio;
+
+        *pHeightAlign = macroTileHeight;
+
+        //
+        // Compute base alignment
+        //
+        *pBaseAlign = pipes *
+            pTileInfo->bankWidth * pTileInfo->banks * pTileInfo->bankHeight * tileSize;
+        // Mip 0 of an SI PRT surface: scale pitch/base alignment when a macro tile is under 64KB.
+        if ((mipLevel == 0) && (flags.prt) && (m_chipFamily == ADDR_CHIP_FAMILY_SI))
+        {
+            static const UINT_32 PrtTileSize = 0x10000;
+
+            UINT_32 macroTileSize = macroTileWidth * macroTileHeight * numSamples * bpp / 8;
+
+            if (macroTileSize < PrtTileSize)
+            {
+                UINT_32 numMacroTiles = PrtTileSize / macroTileSize;
+
+                ADDR_ASSERT((PrtTileSize % macroTileSize) == 0);
+
+                *pPitchAlign *= numMacroTiles;
+                *pBaseAlign  *= numMacroTiles;
+            }
+        }
+    }
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::SanityCheckMacroTiled
+*
+*   @brief
+*       Check if macro-tiled parameters (banks, bankWidth/Height, aspect ratio, tile split) are valid
+*   @return
+*       TRUE if valid; asserts (ADDR_ASSERT) when a check fails
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::SanityCheckMacroTiled(
+    ADDR_TILEINFO* pTileInfo   ///< [in] macro-tiled parameters
+    ) const
+{
+    BOOL_32 valid       = TRUE;
+    UINT_32 numPipes    = HwlGetPipes(pTileInfo);
+    // banks must be 2, 4, 8 or 16
+    switch (pTileInfo->banks)
+    {
+        case 2: //fall through
+        case 4: //fall through
+        case 8: //fall through
+        case 16:
+            break;
+        default:
+            valid = FALSE;
+            break;
+
+    }
+    // bankWidth must be 1, 2, 4 or 8
+    if (valid)
+    {
+        switch (pTileInfo->bankWidth)
+        {
+            case 1: //fall through
+            case 2: //fall through
+            case 4: //fall through
+            case 8:
+                break;
+            default:
+                valid = FALSE;
+                break;
+        }
+    }
+    // bankHeight must be 1, 2, 4 or 8
+    if (valid)
+    {
+        switch (pTileInfo->bankHeight)
+        {
+            case 1: //fall through
+            case 2: //fall through
+            case 4: //fall through
+            case 8:
+                break;
+            default:
+                valid = FALSE;
+                break;
+        }
+    }
+    // macroAspectRatio must be 1, 2, 4 or 8
+    if (valid)
+    {
+        switch (pTileInfo->macroAspectRatio)
+        {
+            case 1: //fall through
+            case 2: //fall through
+            case 4: //fall through
+            case 8:
+                break;
+            default:
+                valid = FALSE;
+                break;
+        }
+    }
+
+    if (valid)
+    {
+        if (pTileInfo->banks < pTileInfo->macroAspectRatio)
+        {
+            // This will generate macro tile height <= 1
+            valid = FALSE;
+        }
+    }
+    // tileSplitBytes must not exceed the row size
+    if (valid)
+    {
+        if (pTileInfo->tileSplitBytes > m_rowSize)
+        {
+            valid = FALSE;
+        }
+    }
+    // Finally apply the hardware-layer specific check
+    if (valid)
+    {
+        valid = HwlSanityCheckMacroTiled(pTileInfo);
+    }
+
+    ADDR_ASSERT(valid == TRUE);
+
+    // Add this assert for guidance (it does not affect the returned value)
+    ADDR_ASSERT(numPipes * pTileInfo->banks >= 4);
+
+    return valid;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceMipLevelTileMode
+*
+*   @brief
+*       Compute valid tile mode for surface mipmap sub-levels: thick modes may degrade to
+*       thinner ones, and 2D/3D macro-tiled modes may degrade to 1D tiling.
+*   @return
+*       Suitable tile mode
+***************************************************************************************************
+*/
+AddrTileMode EgBasedAddrLib::ComputeSurfaceMipLevelTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             bpp,            ///< [in] bits per pixels
+    UINT_32             pitch,          ///< [in] current level pitch
+    UINT_32             height,         ///< [in] current level height
+    UINT_32             numSlices,      ///< [in] current number of slices
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             pitchAlign,     ///< [in] pitch alignment
+    UINT_32             heightAlign,    ///< [in] height alignment
+    ADDR_TILEINFO*      pTileInfo       ///< [in] ptr to bank structure
+    ) const
+{
+    UINT_32 bytesPerTile;
+
+    AddrTileMode expTileMode = baseTileMode;
+    UINT_32 microTileThickness = ComputeSurfaceThickness(expTileMode);
+    UINT_32 interleaveSize = m_pipeInterleaveBytes * m_bankInterleave;
+
+    //
+    // Compute the size of a micro tile in bytes, using NextPow2(bpp) as the element size.
+    //
+    bytesPerTile = BITS_TO_BYTES(MicroTilePixels * microTileThickness * NextPow2(bpp) * numSamples);
+
+    //
+    // Reduce tiling mode from thick to thin if the number of slices is less than the
+    // micro tile thickness.
+    //
+    if (numSlices < microTileThickness)
+    {
+        expTileMode = HwlDegradeThickTileMode(expTileMode, numSlices, &bytesPerTile);
+    }
+    // Clamp the tile size to the tile split size
+    if (bytesPerTile > pTileInfo->tileSplitBytes)
+    {
+        bytesPerTile = pTileInfo->tileSplitBytes;
+    }
+    // Thin macro-tiled modes degrade to 1D when interleaveSize exceeds either threshold below
+    UINT_32 threshold1 =
+        bytesPerTile * HwlGetPipes(pTileInfo) * pTileInfo->bankWidth * pTileInfo->macroAspectRatio;
+
+    UINT_32 threshold2 =
+        bytesPerTile * pTileInfo->bankWidth * pTileInfo->bankHeight;
+
+    //
+    // Reduce the tile mode from 2D/3D to 1D in following conditions
+    //
+    switch (expTileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1: //fall through
+        case ADDR_TM_3D_TILED_THIN1:
+        case ADDR_TM_PRT_TILED_THIN1:
+        case ADDR_TM_PRT_2D_TILED_THIN1:
+        case ADDR_TM_PRT_3D_TILED_THIN1:
+            if ((pitch < pitchAlign) ||
+                (height < heightAlign) ||
+                (interleaveSize > threshold1) ||
+                (interleaveSize > threshold2))
+            {
+                expTileMode = ADDR_TM_1D_TILED_THIN1;
+            }
+            break;
+        case ADDR_TM_2D_TILED_THICK: //fall through
+        case ADDR_TM_3D_TILED_THICK:
+        case ADDR_TM_2D_TILED_XTHICK:
+        case ADDR_TM_3D_TILED_XTHICK:
+        case ADDR_TM_PRT_TILED_THICK:
+        case ADDR_TM_PRT_2D_TILED_THICK:
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            if ((pitch < pitchAlign) ||
+                (height < heightAlign))
+            {
+                expTileMode = ADDR_TM_1D_TILED_THICK;
+            }
+            break;
+        default:
+            break;
+    }
+
+    return expTileMode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlDegradeBaseLevel
+*   @brief
+*       Check if degrade is needed for base level; works on a local copy of pIn->pTileInfo
+*   @return
+*       TRUE if degrade is suggested (width/height below the macro-tile alignment, or alignment invalid)
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::HwlDegradeBaseLevel(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const
+{
+    BOOL_32 degrade = FALSE;
+    BOOL_32 valid = TRUE;
+
+    ADDR_ASSERT(IsMacroTiled(pIn->tileMode));
+
+    UINT_32 baseAlign;
+    UINT_32 pitchAlign;
+    UINT_32 heightAlign;
+
+    ADDR_ASSERT(pIn->pTileInfo);
+    ADDR_TILEINFO tileInfo = *pIn->pTileInfo;
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
+
+    if (UseTileIndex(pIn->tileIndex))
+    {
+        out.tileIndex = pIn->tileIndex;
+        out.macroModeIndex = TileIndexInvalid;
+    }
+    // NOTE(review): the local tileInfo copy is passed twice, presumably as input and output - confirm
+    HwlSetupTileInfo(pIn->tileMode,
+                     pIn->flags,
+                     pIn->bpp,
+                     pIn->width,
+                     pIn->height,
+                     pIn->numSamples,
+                     &tileInfo,
+                     &tileInfo,
+                     pIn->tileType,
+                     &out);
+
+    valid = ComputeSurfaceAlignmentsMacroTiled(pIn->tileMode,
+                                               pIn->bpp,
+                                               pIn->flags,
+                                               pIn->mipLevel,
+                                               pIn->numSamples,
+                                               &tileInfo,
+                                               &baseAlign,
+                                               &pitchAlign,
+                                               &heightAlign);
+    // Degrade when the surface is smaller than one macro tile, or when no valid alignment exists
+    if (valid)
+    {
+        degrade = (pIn->width < pitchAlign || pIn->height < heightAlign);
+    }
+    else
+    {
+        degrade = TRUE;
+    }
+
+    return degrade;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlDegradeThickTileMode
+*
+*   @brief
+*       Degrades valid tile mode for thick modes if needed; when pBytesPerTile is given, the
+*       value it points to is shifted right to match the thinner mode.
+*   @return
+*       Suitable tile mode (baseTileMode unchanged, after an assert, if it is not a thick mode)
+***************************************************************************************************
+*/
+AddrTileMode EgBasedAddrLib::HwlDegradeThickTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             numSlices,      ///< [in] current number of slices
+    UINT_32*            pBytesPerTile   ///< [in/out] pointer to bytes per tile, may be NULL
+    ) const
+{
+    ADDR_ASSERT(numSlices < ComputeSurfaceThickness(baseTileMode));
+    // if pBytesPerTile is NULL, this is a don't-care....
+    UINT_32 bytesPerTile = pBytesPerTile != NULL ? *pBytesPerTile : 64;
+
+    AddrTileMode expTileMode = baseTileMode;
+    switch (baseTileMode)
+    {
+        case ADDR_TM_1D_TILED_THICK:
+            expTileMode = ADDR_TM_1D_TILED_THIN1;
+            bytesPerTile >>= 2;
+            break;
+        case ADDR_TM_2D_TILED_THICK:
+            expTileMode = ADDR_TM_2D_TILED_THIN1;
+            bytesPerTile >>= 2;
+            break;
+        case ADDR_TM_3D_TILED_THICK:
+            expTileMode = ADDR_TM_3D_TILED_THIN1;
+            bytesPerTile >>= 2;
+            break;
+        case ADDR_TM_2D_TILED_XTHICK:
+            if (numSlices < ThickTileThickness)
+            {
+                expTileMode = ADDR_TM_2D_TILED_THIN1;
+                bytesPerTile >>= 3;
+            }
+            else
+            {
+                expTileMode = ADDR_TM_2D_TILED_THICK;
+                bytesPerTile >>= 1;
+            }
+            break;
+        case ADDR_TM_3D_TILED_XTHICK:
+            if (numSlices < ThickTileThickness)
+            {
+                expTileMode = ADDR_TM_3D_TILED_THIN1;
+                bytesPerTile >>= 3;
+            }
+            else
+            {
+                expTileMode = ADDR_TM_3D_TILED_THICK;
+                bytesPerTile >>= 1;
+            }
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+    // Write the adjusted tile size back only when the caller supplied a pointer
+    if (pBytesPerTile != NULL)
+    {
+        *pBytesPerTile = bytesPerTile;
+    }
+
+    return expTileMode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeSurfaceAddrFromCoord
+*
+*   @brief
+*       Compute surface address from given coord (x, y, slice, sample), dispatching on the
+*       tile mode to the linear, micro-tiled or macro-tiled address computation.
+*   @return
+*       Address in bytes (0, after an assert, for an unhandled tile mode)
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::DispatchComputeSurfaceAddrFromCoord(
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    UINT_32             x                  = pIn->x;
+    UINT_32             y                  = pIn->y;
+    UINT_32             slice              = pIn->slice;
+    UINT_32             sample             = pIn->sample;
+    UINT_32             bpp                = pIn->bpp;
+    UINT_32             pitch              = pIn->pitch;
+    UINT_32             height             = pIn->height;
+    UINT_32             numSlices          = pIn->numSlices;
+    UINT_32             numSamples         = ((pIn->numSamples == 0) ? 1 : pIn->numSamples);
+    UINT_32             numFrags           = ((pIn->numFrags == 0) ? numSamples : pIn->numFrags);
+    AddrTileMode        tileMode           = pIn->tileMode;
+    AddrTileType        microTileType      = pIn->tileType;
+    BOOL_32             ignoreSE           = pIn->ignoreSE;
+    BOOL_32             isDepthSampleOrder = pIn->isDepth;
+    ADDR_TILEINFO*      pTileInfo          = pIn->pTileInfo;
+
+    UINT_32*            pBitPosition       = &pOut->bitPosition;
+    UINT_64             addr;
+
+#if ADDR_AM_BUILD
+    UINT_32             addr5Bit           = 0;
+    UINT_32             addr5Swizzle       = pIn->addr5Swizzle;
+    BOOL_32             is32ByteTile       = pIn->is32ByteTile;
+#endif
+
+    // ADDR_DEPTH_SAMPLE_ORDER = non-disp + depth-sample-order
+    if (microTileType == ADDR_DEPTH_SAMPLE_ORDER)
+    {
+        isDepthSampleOrder = TRUE;
+    }
+    // On NI and later, the fragment count replaces the sample count when the two differ
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        if (numFrags != numSamples)
+        {
+            numSamples = numFrags;
+            ADDR_ASSERT(sample < numSamples);
+        }
+
+        /// @note
+        /// 128 bit/thick tiled surface doesn't support display tiling and
+        /// mipmap chain must have the same tileType, so please fill tileType correctly
+        if (!IsLinear(pIn->tileMode))
+        {
+            if (bpp >= 128 || ComputeSurfaceThickness(tileMode) > 1)
+            {
+                ADDR_ASSERT(microTileType != ADDR_DISPLAYABLE);
+            }
+        }
+    }
+
+    switch (tileMode)
+    {
+        case ADDR_TM_LINEAR_GENERAL://fall through
+        case ADDR_TM_LINEAR_ALIGNED:
+            addr = ComputeSurfaceAddrFromCoordLinear(x,
+                                                     y,
+                                                     slice,
+                                                     sample,
+                                                     bpp,
+                                                     pitch,
+                                                     height,
+                                                     numSlices,
+                                                     pBitPosition);
+            break;
+        case ADDR_TM_1D_TILED_THIN1://fall through
+        case ADDR_TM_1D_TILED_THICK:
+            addr = ComputeSurfaceAddrFromCoordMicroTiled(x,
+                                                         y,
+                                                         slice,
+                                                         sample,
+                                                         bpp,
+                                                         pitch,
+                                                         height,
+                                                         numSamples,
+                                                         tileMode,
+                                                         microTileType,
+                                                         isDepthSampleOrder,
+                                                         pBitPosition);
+            break;
+        case ADDR_TM_2D_TILED_THIN1:    //fall through
+        case ADDR_TM_2D_TILED_THICK:    //fall through
+        case ADDR_TM_3D_TILED_THIN1:    //fall through
+        case ADDR_TM_3D_TILED_THICK:    //fall through
+        case ADDR_TM_2D_TILED_XTHICK:   //fall through
+        case ADDR_TM_3D_TILED_XTHICK:   //fall through
+        case ADDR_TM_PRT_TILED_THIN1:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_TILED_THICK:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THICK://fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            UINT_32 pipeSwizzle;
+            UINT_32 bankSwizzle;
+            // Swizzles come either from the combined tileSwizzle or from the separate input fields
+            if (m_configFlags.useCombinedSwizzle)
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+            else
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+
+            addr = ComputeSurfaceAddrFromCoordMacroTiled(x,
+                                                         y,
+                                                         slice,
+                                                         sample,
+                                                         bpp,
+                                                         pitch,
+                                                         height,
+                                                         numSamples,
+                                                         tileMode,
+                                                         microTileType,
+                                                         ignoreSE,
+                                                         isDepthSampleOrder,
+                                                         pipeSwizzle,
+                                                         bankSwizzle,
+                                                         pTileInfo,
+                                                         pBitPosition);
+            break;
+        default:
+            addr = 0;
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+    // ADDR_AM_BUILD only: on NI and later, an extra bit may be ORed into address bit 5
+#if ADDR_AM_BUILD
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        if (addr5Swizzle && isDepthSampleOrder && is32ByteTile)
+        {
+            UINT_32 tx = x >> 3;
+            UINT_32 ty = y >> 3;
+            UINT_32 tileBits = ((ty&0x3) << 2) | (tx&0x3);
+
+            tileBits = tileBits & addr5Swizzle;
+            addr5Bit = XorReduce(tileBits, 4);
+
+            addr = addr | static_cast<UINT_64>(addr5Bit << 5);
+        }
+    }
+#endif
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAddrFromCoordMacroTiled
+*
+*   @brief
+*       Computes the surface address and bit position from a
+*       coordinate for 2D tiled (macro tiled)
+*   @return
+*       The byte address
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeSurfaceAddrFromCoordMacroTiled(
+    UINT_32             x,                      ///< [in] x coordinate
+    UINT_32             y,                      ///< [in] y coordinate
+    UINT_32             slice,                  ///< [in] slice index
+    UINT_32             sample,                 ///< [in] sample index
+    UINT_32             bpp,                    ///< [in] bits per pixel
+    UINT_32             pitch,                  ///< [in] surface pitch, in pixels
+    UINT_32             height,                 ///< [in] surface height, in pixels
+    UINT_32             numSamples,             ///< [in] number of samples
+    AddrTileMode        tileMode,               ///< [in] tile mode
+    AddrTileType        microTileType,          ///< [in] micro tiling type
+    BOOL_32             ignoreSE,               ///< [in] TRUE if shader enginers can be ignored
+    BOOL_32             isDepthSampleOrder,     ///< [in] TRUE if it depth sample ordering is used
+    UINT_32             pipeSwizzle,            ///< [in] pipe swizzle
+    UINT_32             bankSwizzle,            ///< [in] bank swizzle
+    ADDR_TILEINFO*      pTileInfo,              ///< [in] bank structure
+                                                ///  **All fields to be valid on entry**
+    UINT_32*            pBitPosition            ///< [out] bit position, e.g. FMT_1 will use this
+    ) const
+{
+    UINT_64 addr;
+
+    UINT_32 microTileBytes;
+    UINT_32 microTileBits;
+    UINT_32 sampleOffset;
+    UINT_32 pixelIndex;
+    UINT_32 pixelOffset;
+    UINT_32 elementOffset;
+    UINT_32 tileSplitSlice;
+    UINT_32 pipe;
+    UINT_32 bank;
+    UINT_64 sliceBytes;
+    UINT_64 sliceOffset;
+    UINT_32 macroTilePitch;
+    UINT_32 macroTileHeight;
+    UINT_32 macroTilesPerRow;
+    UINT_32 macroTilesPerSlice;
+    UINT_64 macroTileBytes;
+    UINT_32 macroTileIndexX;
+    UINT_32 macroTileIndexY;
+    UINT_64 macroTileOffset;
+    UINT_64 totalOffset;
+    UINT_64 pipeInterleaveMask;
+    UINT_64 bankInterleaveMask;
+    UINT_64 pipeInterleaveOffset;
+    UINT_32 bankInterleaveOffset;
+    UINT_64 offset;
+    UINT_32 tileRowIndex;
+    UINT_32 tileColumnIndex;
+    UINT_32 tileIndex;
+    UINT_32 tileOffset;
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    //
+    // Compute the number of group, pipe, and bank bits.
+    //
+    UINT_32 numPipes              = HwlGetPipes(pTileInfo);
+    UINT_32 numPipeInterleaveBits = Log2(m_pipeInterleaveBytes);
+    UINT_32 numPipeBits           = Log2(numPipes);
+    UINT_32 numBankInterleaveBits = Log2(m_bankInterleave);
+    UINT_32 numBankBits           = Log2(pTileInfo->banks);
+
+    //
+    // Compute the micro tile size.
+    //
+    microTileBits = MicroTilePixels * microTileThickness * bpp * numSamples;
+
+    microTileBytes = microTileBits / 8;
+    //
+    // Compute the pixel index within the micro tile.
+    //
+    pixelIndex = ComputePixelIndexWithinMicroTile(x,
+                                                  y,
+                                                  slice,
+                                                  bpp,
+                                                  tileMode,
+                                                  microTileType);
+
+    //
+    // Compute the sample offset and pixel offset.
+    //
+    if (isDepthSampleOrder)
+    {
+        //
+        // For depth surfaces, samples are stored contiguously for each element, so the sample
+        // offset is the sample number times the element size.
+        //
+        sampleOffset = sample * bpp;
+        pixelOffset  = pixelIndex * bpp * numSamples;
+    }
+    else
+    {
+        //
+        // For color surfaces, all elements for a particular sample are stored contiguously, so
+        // the sample offset is the sample number times the micro tile size divided yBit the number
+        // of samples.
+        //
+        sampleOffset = sample * (microTileBits / numSamples);
+        pixelOffset  = pixelIndex * bpp;
+    }
+
+    //
+    // Compute the element offset.
+    //
+    elementOffset = pixelOffset + sampleOffset;
+
+    *pBitPosition = static_cast<UINT_32>(elementOffset % 8);
+
+    elementOffset /= 8; //bit-to-byte
+
+    //
+    // Determine if tiles need to be split across slices.
+    //
+    // If the size of the micro tile is larger than the tile split size, then the tile will be
+    // split across multiple slices.
+    //
+    UINT_32 slicesPerTile = 1;
+
+    if ((microTileBytes > pTileInfo->tileSplitBytes) && (microTileThickness == 1))
+    {   //don't support for thick mode
+
+        //
+        // Compute the number of slices per tile.
+        //
+        slicesPerTile = microTileBytes / pTileInfo->tileSplitBytes;
+
+        //
+        // Compute the tile split slice number for use in rotating the bank.
+        //
+        tileSplitSlice = elementOffset / pTileInfo->tileSplitBytes;
+
+        //
+        // Adjust the element offset to account for the portion of the tile that is being moved to
+        // a new slice..
+        //
+        elementOffset %= pTileInfo->tileSplitBytes;
+
+        //
+        // Adjust the microTileBytes size to tileSplitBytes size since
+        // a new slice..
+        //
+        microTileBytes = pTileInfo->tileSplitBytes;
+    }
+    else
+    {
+        tileSplitSlice = 0;
+    }
+
+    //
+    // Compute macro tile pitch and height.
+    //
+    macroTilePitch  =
+        (MicroTileWidth  * pTileInfo->bankWidth  * numPipes) * pTileInfo->macroAspectRatio;
+    macroTileHeight =
+        (MicroTileHeight * pTileInfo->bankHeight * pTileInfo->banks) / pTileInfo->macroAspectRatio;
+
+    //
+    // Compute the number of bytes per macro tile. Note: only the bytes of the same bank/pipe
+    // are counted (hence the division by numPipes * banks).
+    macroTileBytes =
+        static_cast<UINT_64>(microTileBytes) *
+        (macroTilePitch / MicroTileWidth) * (macroTileHeight / MicroTileHeight) /
+        (numPipes * pTileInfo->banks);
+
+    //
+    // Compute the number of macro tiles per row.
+    //
+    macroTilesPerRow = pitch / macroTilePitch;
+
+    //
+    // Compute the offset to the macro tile containing the specified coordinate.
+    //
+    macroTileIndexX = x / macroTilePitch;
+    macroTileIndexY = y / macroTileHeight;
+    macroTileOffset = ((macroTileIndexY * macroTilesPerRow) + macroTileIndexX) * macroTileBytes;
+
+    //
+    // Compute the number of macro tiles per slice.
+    //
+    macroTilesPerSlice = macroTilesPerRow  * (height / macroTileHeight);
+
+    //
+    // Compute the slice size.
+    //
+    sliceBytes = macroTilesPerSlice * macroTileBytes;
+
+    //
+    // Compute the slice offset.
+    //
+    sliceOffset = sliceBytes * (tileSplitSlice + slicesPerTile * (slice / microTileThickness));
+
+    //
+    // Compute tile offset
+    //
+    tileRowIndex    = (y / MicroTileHeight) % pTileInfo->bankHeight;
+    tileColumnIndex = ((x / MicroTileWidth) / numPipes) % pTileInfo->bankWidth;
+    tileIndex        = (tileRowIndex * pTileInfo->bankWidth) + tileColumnIndex;
+    tileOffset       = tileIndex * microTileBytes;
+
+    //
+    // Combine the slice offset and macro tile offset with the pixel and sample offsets, accounting
+    // for the pipe and bank bits in the middle of the address.
+    //
+    totalOffset = sliceOffset + macroTileOffset + elementOffset + tileOffset;
+
+    //
+    // Get the pipe and bank.
+    //
+
+    // when the tileMode is PRT type, then adjust x and y coordinates
+    if (IsPrtNoRotationTileMode(tileMode))
+    {
+        x = x % macroTilePitch;
+        y = y % macroTileHeight;
+    }
+
+    pipe = ComputePipeFromCoord(x,
+                                y,
+                                slice,
+                                tileMode,
+                                pipeSwizzle,
+                                ignoreSE,
+                                pTileInfo);
+
+    bank = ComputeBankFromCoord(x,
+                                y,
+                                slice,
+                                tileMode,
+                                bankSwizzle,
+                                tileSplitSlice,
+                                pTileInfo);
+
+
+    //
+    // Split the offset to put some bits below the pipe+bank bits and some above.
+    //
+    pipeInterleaveMask = (1 << numPipeInterleaveBits) - 1;
+    bankInterleaveMask = (1 << numBankInterleaveBits) - 1;
+    pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
+    bankInterleaveOffset = static_cast<UINT_32>((totalOffset >> numPipeInterleaveBits) &
+                                                bankInterleaveMask);
+    offset               =  totalOffset >> (numPipeInterleaveBits + numBankInterleaveBits);
+
+    //
+    // Assemble the address from its components.
+    //
+    addr  = pipeInterleaveOffset;
+    // This is to remove /analyze warnings
+    UINT_32 pipeBits            = pipe                 <<  numPipeInterleaveBits;
+    UINT_32 bankInterleaveBits  = bankInterleaveOffset << (numPipeInterleaveBits + numPipeBits);
+    UINT_32 bankBits            = bank                 << (numPipeInterleaveBits + numPipeBits +
+                                                           numBankInterleaveBits);
+    UINT_64 offsetBits          = offset               << (numPipeInterleaveBits + numPipeBits +
+                                                           numBankInterleaveBits + numBankBits);
+
+    addr |= pipeBits;
+    addr |= bankInterleaveBits;
+    addr |= bankBits;
+    addr |= offsetBits;
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceAddrFromCoordMicroTiled
+*
+*   @brief
+*       Computes the byte address and bit position of a coordinate in a 1D tiled
+*       (micro tiled) surface
+*   @return
+*       The byte address; the bit position within that byte is stored in *pBitPosition
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeSurfaceAddrFromCoordMicroTiled(
+    UINT_32             x,                      ///< [in] x coordinate
+    UINT_32             y,                      ///< [in] y coordinate
+    UINT_32             slice,                  ///< [in] slice index
+    UINT_32             sample,                 ///< [in] sample index
+    UINT_32             bpp,                    ///< [in] bits per pixel
+    UINT_32             pitch,                  ///< [in] pitch, in pixels
+    UINT_32             height,                 ///< [in] height, in pixels
+    UINT_32             numSamples,             ///< [in] number of samples
+    AddrTileMode        tileMode,               ///< [in] tile mode
+    AddrTileType        microTileType,          ///< [in] micro tiling type
+    BOOL_32             isDepthSampleOrder,     ///< [in] TRUE if depth sample ordering is used
+    UINT_32*            pBitPosition            ///< [out] bit position, e.g. FMT_1 will use this
+    ) const
+{
+    UINT_64 addr = 0;
+
+    UINT_32 microTileBytes;
+    UINT_64 sliceBytes;
+    UINT_32 microTilesPerRow;
+    UINT_32 microTileIndexX;
+    UINT_32 microTileIndexY;
+    UINT_32 microTileIndexZ;
+    UINT_64 sliceOffset;
+    UINT_64 microTileOffset;
+    UINT_32 sampleOffset;
+    UINT_32 pixelIndex;
+    UINT_32 pixelOffset;
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    // Compute the micro tile size in bytes. The bit count is
+    // MicroTilePixels * microTileThickness * bpp * numSamples, handed to BITS_TO_BYTES for the
+    // conversion.
+    microTileBytes = BITS_TO_BYTES(MicroTilePixels * microTileThickness * bpp * numSamples);
+
+    // Compute the slice size in bytes: pitch * height pixels, microTileThickness deep, with
+    // bpp * numSamples bits each. pitch is widened to UINT_64 first so the whole product is
+    // evaluated in 64 bits.
+    sliceBytes =
+        BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * microTileThickness * bpp * numSamples);
+
+    //
+    // Compute the number of micro tiles per row (integer division; pitch is in pixels).
+    //
+    microTilesPerRow = pitch / MicroTileWidth;
+
+    // Compute the micro tile index along each axis; the z index counts groups of
+    // microTileThickness slices.
+    //
+    microTileIndexX = x     / MicroTileWidth;
+    microTileIndexY = y     / MicroTileHeight;
+    microTileIndexZ = slice / microTileThickness;
+
+    //
+    // Compute the slice offset (byte offset of the group of slices containing the coordinate).
+    //
+    sliceOffset = static_cast<UINT_64>(microTileIndexZ) * sliceBytes;
+
+    // Compute the offset to the micro tile containing the specified coordinate. Tiles are
+    // counted row by row (microTilesPerRow per row) and the index is evaluated in 64 bits.
+    //
+    microTileOffset = (static_cast<UINT_64>(microTileIndexY) * microTilesPerRow + microTileIndexX) *
+        microTileBytes;
+
+    //
+    // Compute the pixel index within the micro tile.
+    //
+    pixelIndex = ComputePixelIndexWithinMicroTile(x,
+                                                  y,
+                                                  slice,
+                                                  bpp,
+                                                  tileMode,
+                                                  microTileType);
+
+    // Compute the sample offset and the pixel offset, both in bits.
+    //
+    if (isDepthSampleOrder)
+    {
+        // For depth surfaces, samples are stored contiguously for each element, so the sample
+        // offset is the sample number times the element size, and consecutive pixels are
+        // bpp * numSamples bits apart.
+        //
+        sampleOffset = sample * bpp;
+        pixelOffset = pixelIndex * bpp * numSamples;
+    }
+    else
+    {
+        //
+        // For color surfaces, all elements for a particular sample are stored contiguously, so
+        // the sample offset is the sample number times the micro tile size (in bits, hence the
+        // multiplication by 8) divided by the number of samples.
+        //
+        sampleOffset = sample * (microTileBytes*8 / numSamples);
+        pixelOffset = pixelIndex * bpp;
+    }
+
+    // Split the element offset (in bits) into the bit position within its byte, returned
+    // through pBitPosition, and the whole-byte offset that goes into the address.
+    //
+
+    UINT_32 elemOffset = sampleOffset + pixelOffset;
+
+    *pBitPosition = elemOffset % 8;
+    elemOffset /= 8;
+
+    //
+    // Combine the slice offset, micro tile offset, sample offset, and pixel offsets.
+    //
+    addr = sliceOffset + microTileOffset + elemOffset;
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputePixelCoordFromOffset
+*
+*   @brief
+*       Compute pixel coordinate from offset inside a micro tile
+*   @return
+*       N/A; *pX, *pY and *pSample are overwritten, the z coordinate is added to *pSlice
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::HwlComputePixelCoordFromOffset(
+    UINT_32         offset,             ///< [in] offset inside micro tile in bits
+    UINT_32         bpp,                ///< [in] bits per pixel
+    UINT_32         numSamples,         ///< [in] number of samples
+    AddrTileMode    tileMode,           ///< [in] tile mode
+    UINT_32         tileBase,           ///< [in] base offset within a tile
+    UINT_32         compBits,           ///< [in] component bits actually needed(for planar surface)
+    UINT_32*        pX,                 ///< [out] x coordinate
+    UINT_32*        pY,                 ///< [out] y coordinate
+    UINT_32*        pSlice,             ///< [in,out] slice index; z within the tile is added to it
+    UINT_32*        pSample,            ///< [out] sample index
+    AddrTileType    microTileType,      ///< [in] micro tiling type
+    BOOL_32         isDepthSampleOrder  ///< [in] TRUE if depth sample order in microtile is used
+    ) const
+{
+    UINT_32 x = 0;
+    UINT_32 y = 0;
+    UINT_32 z = 0;
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+
+    // For planar surface, we adjust offset according to tile base and decode with compBits as bpp
+    if ((bpp != compBits) && (compBits != 0) && isDepthSampleOrder)
+    {
+        offset -= tileBase;
+
+        ADDR_ASSERT(microTileType == ADDR_NON_DISPLAYABLE ||
+                    microTileType == ADDR_DEPTH_SAMPLE_ORDER);
+
+        bpp = compBits;
+    }
+
+    UINT_32 sampleTileBits;     // bits taken by one sample of every pixel (color order only)
+    UINT_32 samplePixelBits;    // bits taken by all samples of one pixel (depth order only)
+    UINT_32 pixelIndex;         // element index within the micro tile, decoded into x/y/z below
+
+    if (isDepthSampleOrder)     // all samples of a pixel are stored together
+    {
+        samplePixelBits = bpp * numSamples;
+        pixelIndex = offset / samplePixelBits;
+        *pSample = (offset % samplePixelBits) / bpp;
+    }
+    else                        // all pixels of a sample are stored together
+    {
+        sampleTileBits = MicroTilePixels * bpp * thickness;
+        *pSample = offset / sampleTileBits;
+        pixelIndex = (offset % sampleTileBits) / bpp;
+    }
+
+    if (microTileType != ADDR_THICK) // x and y come from pixelIndex[5:0], z (if any) from [8:6]
+    {
+        if (microTileType == ADDR_DISPLAYABLE) // displayable
+        {
+            switch (bpp)
+            {
+                case 8:
+                    x = pixelIndex & 0x7;
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,3),_BIT(pixelIndex,4));
+                    break;
+                case 16:
+                    x = pixelIndex & 0x7;
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,3));
+                    break;
+                case 32:
+                    x = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,1),_BIT(pixelIndex,0));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,2));
+                    break;
+                case 64:
+                    x = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,1));
+                    break;
+                case 128:
+                    x = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,2),_BIT(pixelIndex,1));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,0));
+                    break;
+                default: // any other bpp leaves x and y at 0; no assert in this branch
+                    break;
+            }
+        }
+        else if (microTileType == ADDR_NON_DISPLAYABLE || microTileType == ADDR_DEPTH_SAMPLE_ORDER)
+        {   // bpp-independent: x is taken from bits 4,2,0 and y from bits 5,3,1
+            x = Bits2Number(3, _BIT(pixelIndex,4),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+            y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,3),_BIT(pixelIndex,1));
+        }
+        else if (microTileType == ADDR_ROTATED)
+        {
+            /*
+                8-Bit Elements
+                element_index[5:0] = { x[2], x[0], x[1], y[2], y[1], y[0] }
+
+                16-Bit Elements
+                element_index[5:0] = { x[2], x[1], x[0], y[2], y[1], y[0] }
+
+                32-Bit Elements
+                element_index[5:0] = { x[2], x[1], y[2], x[0], y[1], y[0] }
+
+                64-Bit Elements
+                element_index[5:0] = { y[2], x[2], x[1], y[1], x[0], y[0] }
+            */
+            switch(bpp)
+            {
+                case 8:
+                    x = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,3),_BIT(pixelIndex,4));
+                    y = pixelIndex & 0x7;
+                    break;
+                case 16:
+                    x = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,3));
+                    y = pixelIndex & 0x7;
+                    break;
+                case 32:
+                    x = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,4),_BIT(pixelIndex,2));
+                    y = Bits2Number(3, _BIT(pixelIndex,3),_BIT(pixelIndex,1),_BIT(pixelIndex,0));
+                    break;
+                case 64:
+                    x = Bits2Number(3, _BIT(pixelIndex,4),_BIT(pixelIndex,3),_BIT(pixelIndex,1));
+                    y = Bits2Number(3, _BIT(pixelIndex,5),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                    break;
+                default: // no rotated layout is listed above for any other bpp
+                    ADDR_ASSERT_ALWAYS();
+                    break;
+            }
+        }
+
+        if (thickness > 1) // thick
+        {
+            z = Bits2Number(3, _BIT(pixelIndex,8),_BIT(pixelIndex,7),_BIT(pixelIndex,6));
+        }
+    }
+    else
+    {
+        ADDR_ASSERT((m_chipFamily >= ADDR_CHIP_FAMILY_CI) && (thickness > 1));
+        /*
+            8-Bit Elements and 16-Bit Elements
+            element_index[7:0] = { y[2], x[2], z[1], z[0], y[1], x[1], y[0], x[0] }
+
+            32-Bit Elements
+            element_index[7:0] = { y[2], x[2], z[1], y[1], z[0], x[1], y[0], x[0] }
+
+            64-Bit Elements and 128-Bit Elements
+            element_index[7:0] = { y[2], x[2], z[1], y[1], x[1], z[0], y[0], x[0] }
+
+            The equation to compute the element index for the extra thick tile:
+            element_index[8] = z[2]
+        */
+        switch (bpp)
+        {
+            case 8:
+            case 16: // fall-through
+                x = Bits2Number(3, _BIT(pixelIndex,6),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                y = Bits2Number(3, _BIT(pixelIndex,7),_BIT(pixelIndex,3),_BIT(pixelIndex,1));
+                z = Bits2Number(2, _BIT(pixelIndex,5),_BIT(pixelIndex,4));
+                break;
+            case 32:
+                x = Bits2Number(3, _BIT(pixelIndex,6),_BIT(pixelIndex,2),_BIT(pixelIndex,0));
+                y = Bits2Number(3, _BIT(pixelIndex,7),_BIT(pixelIndex,4),_BIT(pixelIndex,1));
+                z = Bits2Number(2, _BIT(pixelIndex,5),_BIT(pixelIndex,3));
+                break;
+            case 64:
+            case 128: // fall-through
+                x = Bits2Number(3, _BIT(pixelIndex,6),_BIT(pixelIndex,3),_BIT(pixelIndex,0));
+                y = Bits2Number(3, _BIT(pixelIndex,7),_BIT(pixelIndex,4),_BIT(pixelIndex,1));
+                z = Bits2Number(2, _BIT(pixelIndex,5),_BIT(pixelIndex,2));
+                break;
+            default:
+                ADDR_ASSERT_ALWAYS();
+                break;
+        }
+
+        if (thickness == 8) // extra thick: element_index[8] supplies z[2]
+        {
+            z += Bits2Number(3,_BIT(pixelIndex,8),0,0);
+        }
+    }
+
+    *pX = x;
+    *pY = y;
+    *pSlice += z; // accumulated, not assigned: the caller must initialize *pSlice
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeSurfaceCoordFromAddr
+*
+*   @brief
+*       Dispatch on tile mode to compute (x,y,slice,sample) coordinates from surface address
+*   @return
+*       N/A; results are written to pOut->x, pOut->y, pOut->slice and pOut->sample
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::DispatchComputeSurfaceCoordFromAddr(
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    UINT_64             addr               = pIn->addr;
+    UINT_32             bitPosition        = pIn->bitPosition;
+    UINT_32             bpp                = pIn->bpp;
+    UINT_32             pitch              = pIn->pitch;
+    UINT_32             height             = pIn->height;
+    UINT_32             numSlices          = pIn->numSlices;
+    UINT_32             numSamples         = ((pIn->numSamples == 0) ? 1 : pIn->numSamples);
+    UINT_32             numFrags           = ((pIn->numFrags == 0) ? numSamples : pIn->numFrags);
+    AddrTileMode        tileMode           = pIn->tileMode;
+    UINT_32             tileBase           = pIn->tileBase;
+    UINT_32             compBits           = pIn->compBits;
+    AddrTileType        microTileType      = pIn->tileType;
+    BOOL_32             ignoreSE           = pIn->ignoreSE;
+    BOOL_32             isDepthSampleOrder = pIn->isDepth;
+    ADDR_TILEINFO*      pTileInfo          = pIn->pTileInfo;
+
+    UINT_32*            pX                 = &pOut->x;
+    UINT_32*            pY                 = &pOut->y;
+    UINT_32*            pSlice             = &pOut->slice;
+    UINT_32*            pSample            = &pOut->sample;
+
+    if (microTileType == ADDR_DEPTH_SAMPLE_ORDER) // this tile type forces depth sample order
+    {
+        isDepthSampleOrder = TRUE;
+    }
+
+    if (m_chipFamily >= ADDR_CHIP_FAMILY_NI)
+    {
+        if (numFrags != numSamples)
+        {
+            numSamples = numFrags; // from here on numSamples carries the fragment count
+        }
+
+        /// @note
+        /// 128 bit/thick tiled surface doesn't support display tiling and
+        /// mipmap chain must have the same tileType, so please fill tileType correctly
+        if (!IsLinear(pIn->tileMode))
+        {
+            if (bpp >= 128 || ComputeSurfaceThickness(tileMode) > 1)
+            {
+                ADDR_ASSERT(microTileType != ADDR_DISPLAYABLE);
+            }
+        }
+    }
+
+    switch (tileMode)
+    {
+        case ADDR_TM_LINEAR_GENERAL://fall through
+        case ADDR_TM_LINEAR_ALIGNED:
+            ComputeSurfaceCoordFromAddrLinear(addr,
+                                              bitPosition,
+                                              bpp,
+                                              pitch,
+                                              height,
+                                              numSlices,
+                                              pX,
+                                              pY,
+                                              pSlice,
+                                              pSample);
+            break;
+        case ADDR_TM_1D_TILED_THIN1://fall through
+        case ADDR_TM_1D_TILED_THICK:
+            ComputeSurfaceCoordFromAddrMicroTiled(addr,
+                                                  bitPosition,
+                                                  bpp,
+                                                  pitch,
+                                                  height,
+                                                  numSamples,
+                                                  tileMode,
+                                                  tileBase,
+                                                  compBits,
+                                                  pX,
+                                                  pY,
+                                                  pSlice,
+                                                  pSample,
+                                                  microTileType,
+                                                  isDepthSampleOrder);
+            break;
+        case ADDR_TM_2D_TILED_THIN1:    //fall through
+        case ADDR_TM_2D_TILED_THICK:    //fall through
+        case ADDR_TM_3D_TILED_THIN1:    //fall through
+        case ADDR_TM_3D_TILED_THICK:    //fall through
+        case ADDR_TM_2D_TILED_XTHICK:   //fall through
+        case ADDR_TM_3D_TILED_XTHICK:   //fall through
+        case ADDR_TM_PRT_TILED_THIN1:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1://fall through
+        case ADDR_TM_PRT_TILED_THICK:   //fall through
+        case ADDR_TM_PRT_2D_TILED_THICK://fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            UINT_32 pipeSwizzle;
+            UINT_32 bankSwizzle;
+
+            if (m_configFlags.useCombinedSwizzle) // split pIn->tileSwizzle into bank and pipe parts
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+            else // the caller supplied the two swizzles separately
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+
+            ComputeSurfaceCoordFromAddrMacroTiled(addr,
+                                                  bitPosition,
+                                                  bpp,
+                                                  pitch,
+                                                  height,
+                                                  numSamples,
+                                                  tileMode,
+                                                  tileBase,
+                                                  compBits,
+                                                  microTileType,
+                                                  ignoreSE,
+                                                  isDepthSampleOrder,
+                                                  pipeSwizzle,
+                                                  bankSwizzle,
+                                                  pTileInfo,
+                                                  pX,
+                                                  pY,
+                                                  pSlice,
+                                                  pSample);
+            break;
+        default: // tile mode not handled above: assert only, pOut is not written
+            ADDR_ASSERT_ALWAYS();
+    }
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceCoordFromAddrMacroTiled
+*
+*   @brief
+*       Compute surface coordinates from address for macro tiled surface
+*   @return
+*       N/A; results are written to *pX, *pY, *pSlice and *pSample
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeSurfaceCoordFromAddrMacroTiled(
+    UINT_64             addr,               ///< [in] byte address
+    UINT_32             bitPosition,        ///< [in] bit position
+    UINT_32             bpp,                ///< [in] bits per pixel
+    UINT_32             pitch,              ///< [in] pitch in pixels
+    UINT_32             height,             ///< [in] height in pixels
+    UINT_32             numSamples,         ///< [in] number of samples
+    AddrTileMode        tileMode,           ///< [in] tile mode
+    UINT_32             tileBase,           ///< [in] tile base offset
+    UINT_32             compBits,           ///< [in] component bits (for planar surface)
+    AddrTileType        microTileType,      ///< [in] micro tiling type
+    BOOL_32             ignoreSE,           ///< [in] TRUE if shader engines can be ignored
+    BOOL_32             isDepthSampleOrder, ///< [in] TRUE if depth sample order is used
+    UINT_32             pipeSwizzle,        ///< [in] pipe swizzle
+    UINT_32             bankSwizzle,        ///< [in] bank swizzle
+    ADDR_TILEINFO*      pTileInfo,          ///< [in] bank structure.
+                                            ///  **All fields to be valid on entry**
+    UINT_32*            pX,                 ///< [out] X coord
+    UINT_32*            pY,                 ///< [out] Y coord
+    UINT_32*            pSlice,             ///< [out] slice index
+    UINT_32*            pSample             ///< [out] sample index
+    ) const
+{
+    UINT_32 mx;
+    UINT_32 my;
+    UINT_64 tileBits;
+    UINT_64 macroTileBits;
+    UINT_32 slices;
+    UINT_32 tileSlices;
+    UINT_64 elementOffset;
+    UINT_64 macroTileIndex;
+    UINT_32 tileIndex;
+    UINT_64 totalOffset;
+
+
+    UINT_32 bank;
+    UINT_32 pipe;
+    UINT_32 groupBits = m_pipeInterleaveBytes << 3; // pipe interleave size in bits
+    UINT_32 pipes = HwlGetPipes(pTileInfo);
+    UINT_32 banks = pTileInfo->banks;
+
+    UINT_32 bankInterleave = m_bankInterleave;
+
+    UINT_64 addrBits = BYTES_TO_BITS(addr) + bitPosition; // address expressed in bits
+
+    // Remove bits for bank and pipe: keep the offset within a pipe-interleave group, the
+    // bank-interleave index, and everything above the pipe and bank fields, repacked so that
+    // totalOffset is a contiguous bit offset.
+    totalOffset = (addrBits % groupBits) +
+        (((addrBits / groupBits / pipes) % bankInterleave) * groupBits) +
+        (((addrBits / groupBits / pipes) / bankInterleave) / banks) * groupBits * bankInterleave;
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    UINT_32 microTileBits = bpp * microTileThickness * MicroTilePixels * numSamples;
+
+    UINT_32 microTileBytes = BITS_TO_BYTES(microTileBits);
+    //
+    // Determine if tiles need to be split across slices.
+    //
+    // If the size of the micro tile is larger than the tile split size, then the tile will be
+    // split across multiple slices.
+    //
+    UINT_32 slicesPerTile = 1; //_State->TileSlices
+
+    if ((microTileBytes > pTileInfo->tileSplitBytes) && (microTileThickness == 1))
+    {   //don't support for thick mode
+
+        //
+        // Compute the number of slices per tile.
+        //
+        slicesPerTile = microTileBytes / pTileInfo->tileSplitBytes;
+    }
+
+    tileBits = microTileBits / slicesPerTile; // micro tile bits
+
+    // macro tile width in micro tiles (not yet multiplied by MicroTileWidth)
+    UINT_32 macroWidth  = pTileInfo->bankWidth * pipes * pTileInfo->macroAspectRatio;
+    // macro tile height, in micro tiles as well
+    UINT_32 macroHeight = pTileInfo->bankHeight * banks / pTileInfo->macroAspectRatio;
+
+    UINT_32 pitchInMacroTiles = pitch / MicroTileWidth / macroWidth;
+
+    macroTileBits = (macroWidth * macroHeight) * tileBits / (banks * pipes);
+
+    macroTileIndex = totalOffset / macroTileBits;
+
+    // pitchMacros * height / heightMacros;  macroTilesPerSlice == _State->SliceMacros
+    UINT_32 macroTilesPerSlice = (pitch / (macroWidth * MicroTileWidth)) * height /
+        (macroHeight * MicroTileHeight); // vertical extent is scaled by the micro tile height
+
+    slices = static_cast<UINT_32>(macroTileIndex / macroTilesPerSlice);
+
+    *pSlice = static_cast<UINT_32>(slices / slicesPerTile * microTileThickness);
+
+    //
+    // calculate element offset and x[2:0], y[2:0], z[1:0] for thick
+    //
+    tileSlices = slices % slicesPerTile;
+
+    elementOffset  = tileSlices * tileBits;
+    elementOffset += totalOffset % tileBits;
+
+    UINT_32 coordZ = 0; // z within the tile; starts at 0 because EgBasedAddrLib's callee adds to it
+
+    HwlComputePixelCoordFromOffset(static_cast<UINT_32>(elementOffset),
+                                   bpp,
+                                   numSamples,
+                                   tileMode,
+                                   tileBase,
+                                   compBits,
+                                   pX,
+                                   pY,
+                                   &coordZ,
+                                   pSample,
+                                   microTileType,
+                                   isDepthSampleOrder);
+
+    macroTileIndex = macroTileIndex % macroTilesPerSlice; // index within the slice
+    *pY += static_cast<UINT_32>(macroTileIndex / pitchInMacroTiles * macroHeight * MicroTileHeight);
+    *pX += static_cast<UINT_32>(macroTileIndex % pitchInMacroTiles * macroWidth * MicroTileWidth);
+
+    *pSlice += coordZ;
+
+    tileIndex = static_cast<UINT_32>((totalOffset % macroTileBits) / tileBits);
+
+    my = (tileIndex / pTileInfo->bankWidth) % pTileInfo->bankHeight * MicroTileHeight;
+    mx = (tileIndex % pTileInfo->bankWidth) * pipes * MicroTileWidth;
+
+    *pY += my;
+    *pX += mx;
+
+    bank = ComputeBankFromAddr(addr, banks, pipes); // bank/pipe fields stripped from totalOffset
+    pipe = ComputePipeFromAddr(addr, pipes);
+
+    HwlComputeSurfaceCoord2DFromBankPipe(tileMode,
+                                         pX,
+                                         pY,
+                                         *pSlice,
+                                         bank,
+                                         pipe,
+                                         bankSwizzle,
+                                         pipeSwizzle,
+                                         tileSlices,
+                                         ignoreSE,
+                                         pTileInfo);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSurfaceCoord2DFromBankPipe
+*
+*   @brief
+*       Compute surface x,y coordinates from bank/pipe info
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeSurfaceCoord2DFromBankPipe(
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    UINT_32             x,          ///< [in] x coordinate
+    UINT_32             y,          ///< [in] y coordinate
+    UINT_32             slice,      ///< [in] slice index
+    UINT_32             bank,       ///< [in] bank number
+    UINT_32             pipe,       ///< [in] pipe number
+    UINT_32             bankSwizzle,///< [in] bank swizzle
+    UINT_32             pipeSwizzle,///< [in] pipe swizzle
+    UINT_32             tileSlices, ///< [in] slices in a micro tile
+    ADDR_TILEINFO*      pTileInfo,  ///< [in] bank structure. **All fields to be valid on entry**
+    CoordFromBankPipe*  pOutput     ///< [out] pointer to extracted x/y bits
+    ) const
+{
+    UINT_32 yBit3 = 0;
+    UINT_32 yBit4 = 0;
+    UINT_32 yBit5 = 0;
+    UINT_32 yBit6 = 0;
+
+    UINT_32 xBit3 = 0;
+    UINT_32 xBit4 = 0;
+    UINT_32 xBit5 = 0;
+
+    UINT_32 tileSplitRotation;
+
+    UINT_32 numPipes = HwlGetPipes(pTileInfo);
+
+    UINT_32 bankRotation = ComputeBankRotation(tileMode,
+                                               pTileInfo->banks, numPipes);
+
+    UINT_32 pipeRotation = ComputePipeRotation(tileMode, numPipes);
+
+    UINT_32 xBit = x / (MicroTileWidth * pTileInfo->bankWidth * numPipes);
+    UINT_32 yBit = y / (MicroTileHeight * pTileInfo->bankHeight);
+
+    //calculate the bank and pipe before rotation and swizzle
+
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1:  //fall through
+        case ADDR_TM_2D_TILED_THICK:  //fall through
+        case ADDR_TM_2D_TILED_XTHICK: //fall through
+        case ADDR_TM_3D_TILED_THIN1:  //fall through
+        case ADDR_TM_3D_TILED_THICK:  //fall through
+        case ADDR_TM_3D_TILED_XTHICK:
+            tileSplitRotation = ((pTileInfo->banks / 2) + 1);
+            break;
+        default:
+            tileSplitRotation =  0;
+            break;
+    }
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    bank ^= tileSplitRotation * tileSlices;
+    if (pipeRotation == 0)
+    {
+        bank ^= bankRotation * (slice / microTileThickness) + bankSwizzle;
+        bank %= pTileInfo->banks;
+        pipe ^= pipeSwizzle;
+    }
+    else
+    {
+        bank ^= bankRotation * (slice / microTileThickness) / numPipes + bankSwizzle;
+        bank %= pTileInfo->banks;
+        pipe ^= pipeRotation * (slice / microTileThickness) + pipeSwizzle;
+    }
+
+    if (pTileInfo->macroAspectRatio == 1)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 2:
+                yBit3 = _BIT(bank, 0) ^ _BIT(xBit,0);
+                break;
+            case 4:
+                yBit4 = _BIT(bank, 0) ^ _BIT(xBit,0);
+                yBit3 = _BIT(bank, 1) ^ _BIT(xBit,1);
+                break;
+            case 8:
+                yBit3 = _BIT(bank, 2) ^ _BIT(xBit,2);
+                yBit5 = _BIT(bank, 0) ^ _BIT(xBit,0);
+                yBit4 = _BIT(bank, 1) ^ _BIT(xBit,1) ^ yBit5;
+                break;
+            case 16:
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3);
+                yBit4 = _BIT(bank, 2) ^ _BIT(xBit, 2);
+                yBit6 = _BIT(bank, 0) ^ _BIT(xBit, 0);
+                yBit5 = _BIT(bank, 1) ^ _BIT(xBit, 1) ^ yBit6;
+                break;
+            default:
+                break;
+        }
+
+    }
+    else if (pTileInfo->macroAspectRatio == 2)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 2: //xBit3 = yBit3^b0
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,0);
+                break;
+            case 4: //xBit3=yBit4^b0; yBit3=xBit4^b1
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,1);
+                yBit3 = _BIT(bank, 1) ^ _BIT(xBit,1);
+                break;
+            case 8: //xBit4, xBit5, yBit5 are known
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,2);
+                yBit3 = _BIT(bank, 2) ^ _BIT(xBit,2);
+                yBit4 = _BIT(bank, 1) ^ _BIT(xBit,1) ^ _BIT(yBit, 2);
+                break;
+            case 16://x4,x5,x6,y6 are known
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit, 3); //x3 = y6 ^ b0
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3); //y3 = x6 ^ b3
+                yBit4 = _BIT(bank, 2) ^ _BIT(xBit, 2); //y4 = x5 ^ b2
+                yBit5 = _BIT(bank, 1) ^ _BIT(xBit, 1) ^ _BIT(yBit, 3); //y5=x4^y6^b1
+                break;
+            default:
+                break;
+        }
+    }
+    else if (pTileInfo->macroAspectRatio == 4)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 4: //yBit3, yBit4
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,1);
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit,0);
+                break;
+            case 8: //xBit5, yBit4, yBit5
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,2);
+                yBit3 = _BIT(bank, 2) ^ _BIT(xBit,2);
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit,1) ^  _BIT(yBit,2);
+                break;
+            case 16: //xBit5, xBit6, yBit5, yBit6
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit, 3);//x3 = b0 ^ y6
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit, 2) ^ _BIT(yBit, 3);//x4 = b1 ^ y5 ^ y6;
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3); //y3 = b3 ^ x6;
+                yBit4 = _BIT(bank, 2) ^ _BIT(xBit, 2); //y4 = b2 ^ x5;
+                break;
+            default:
+                break;
+        }
+    }
+    else if (pTileInfo->macroAspectRatio == 8)
+    {
+        switch (pTileInfo->banks)
+        {
+            case 8: //yBit3, yBit4, yBit5
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit,2); //x3 = b0 ^ y5;
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit,1) ^ _BIT(yBit, 2);//x4 = b1 ^ y4 ^ y5;
+                xBit5 = _BIT(bank, 2) ^ _BIT(yBit,0);
+                break;
+            case 16: //xBit6, yBit4, yBit5, yBit6
+                xBit3 = _BIT(bank, 0) ^ _BIT(yBit, 3);//x3 = y6 ^ b0
+                xBit4 = _BIT(bank, 1) ^ _BIT(yBit, 2) ^ _BIT(yBit, 3);//x4 = y5 ^ y6 ^ b1
+                xBit5 = _BIT(bank, 2) ^ _BIT(yBit, 1);//x5 = y4 ^ b2
+                yBit3 = _BIT(bank, 3) ^ _BIT(xBit, 3); //y3 = x6 ^ b3
+                break;
+            default:
+                break;
+        }
+    }
+
+    pOutput->xBits = xBit;
+    pOutput->yBits = yBit;
+
+    pOutput->xBit3 = xBit3;
+    pOutput->xBit4 = xBit4;
+    pOutput->xBit5 = xBit5;
+    pOutput->yBit3 = yBit3;
+    pOutput->yBit4 = yBit4;
+    pOutput->yBit5 = yBit5;
+    pOutput->yBit6 = yBit6;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlExtractBankPipeSwizzle
+*   @brief
+*       Entry point that splits pIn->base256b into separate bank and pipe swizzle values
+*   @return
+*       Always ADDR_OK; results land in pOut->bankSwizzle and pOut->pipeSwizzle
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlExtractBankPipeSwizzle(
+    const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT*  pIn,   ///< [in] input structure
+    ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT*       pOut   ///< [out] output structure
+    ) const
+{
+    UINT_32* const pBankSwizzle = &pOut->bankSwizzle;
+    UINT_32* const pPipeSwizzle = &pOut->pipeSwizzle;
+
+    // Nothing is validated here: pIn, pOut and pIn->pTileInfo are used exactly as passed in.
+    ExtractBankPipeSwizzle(pIn->base256b, pIn->pTileInfo, pBankSwizzle, pPipeSwizzle);
+    return ADDR_OK;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlCombineBankPipeSwizzle
+*   @brief
+*       Combine bank/pipe swizzle (and base address) into one value via GetBankPipeSwizzle
+*   @return
+*       ADDR_OK on success, ADDR_INVALIDPARAMS when pTileSwizzle is NULL
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlCombineBankPipeSwizzle(
+    UINT_32         bankSwizzle,    ///< [in] bank swizzle
+    UINT_32         pipeSwizzle,    ///< [in] pipe swizzle
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] tile info
+    UINT_64         baseAddr,       ///< [in] base address
+    UINT_32*        pTileSwizzle    ///< [out] combined swizzle
+    ) const
+{
+    // Without an output pointer there is nowhere to store the result.
+    if (pTileSwizzle == NULL)
+    {
+        return ADDR_INVALIDPARAMS;
+    }
+
+    // Fold both swizzles and the base address into a single combined value.
+    const UINT_32 combined = GetBankPipeSwizzle(bankSwizzle, pipeSwizzle, baseAddr, pTileInfo);
+
+    *pTileSwizzle = combined;
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeBaseSwizzle
+*   @brief
+*       Compute base swizzle from pIn->surfIndex, the bank count and the swizzle options
+*   @return
+*       ADDR_E_RETURNCODE of HwlCombineBankPipeSwizzle; the result is in pOut->tileSwizzle
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeBaseSwizzle(
+    const ADDR_COMPUTE_BASE_SWIZZLE_INPUT* pIn,     ///< [in] input structure
+    ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut          ///< [out] output structure
+    ) const
+{
+    ADDR_TILEINFO* pTileInfo = pIn->pTileInfo;
+
+    ADDR_ASSERT(IsMacroTiled(pIn->tileMode));
+    ADDR_ASSERT(pIn->pTileInfo);
+
+    /// This is a legacy misreading of h/w doc, use it as it doesn't hurt.
+    /// Rows follow the bank count (2, 4, 8, 16); columns are the masked surface index.
+    static const UINT_8 bankRotationArray[4][16] = {
+        { 0, 0,  0, 0,  0, 0,  0, 0, 0,  0, 0,  0, 0,  0, 0, 0 }, // ADDR_SURF_2_BANK
+        { 0, 1,  2, 3,  0, 0,  0, 0, 0,  0, 0,  0, 0,  0, 0, 0 }, // ADDR_SURF_4_BANK
+        { 0, 3,  6, 1,  4, 7,  2, 5, 0,  0, 0,  0, 0,  0, 0, 0 }, // ADDR_SURF_8_BANK
+        { 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 }, // ADDR_SURF_16_BANK
+    };
+
+    // A NULL tile info falls back to the 2-bank case.
+    UINT_32 banks = (pTileInfo != NULL) ? pTileInfo->banks : 2;
+
+    // Uses less bank swizzle bits
+    if (pIn->option.reduceBankBit && (banks > 2))
+    {
+        banks >>= 1;
+    }
+
+    // Pick the bankRotationArray row for this bank count (row 0 if the count is unexpected).
+    UINT_32 tableRow = 0;
+
+    switch (banks)
+    {
+        case 2:
+            tableRow = 0;
+            break;
+        case 4:
+            tableRow = 1;
+            break;
+        case 8:
+            tableRow = 2;
+            break;
+        case 16:
+            tableRow = 3;
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+
+    // The surface index is masked with (banks - 1) before it is used.
+    const UINT_32 surfBits = pIn->surfIndex & (banks - 1);
+
+    // ADDR_SWIZZLE_GEN_LINEAR uses those bits directly; anything else goes through the table.
+    const UINT_32 bankSwizzle = (pIn->option.genOption == ADDR_SWIZZLE_GEN_LINEAR)
+                                ? surfBits
+                                : bankRotationArray[tableRow][surfBits];
+
+    // A pipe swizzle is only generated when IsMacro3dTiled accepts the tile mode.
+    const UINT_32 pipeSwizzle = IsMacro3dTiled(pIn->tileMode)
+                                ? (pIn->surfIndex & (HwlGetPipes(pTileInfo) - 1))
+                                : 0;
+
+    return HwlCombineBankPipeSwizzle(bankSwizzle, pipeSwizzle, pTileInfo, 0, &pOut->tileSwizzle);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ExtractBankPipeSwizzle
+*   @brief
+*       Extract bank/pipe swizzle from base256b; both outputs are 0 when base256b is 0
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ExtractBankPipeSwizzle(
+    UINT_32         base256b,       ///< [in] input base256b register value
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] 2D tile parameters. Client must provide all data
+    UINT_32*        pBankSwizzle,   ///< [out] bank swizzle
+    UINT_32*        pPipeSwizzle    ///< [out] pipe swizzle
+    ) const
+{
+    UINT_32 bankSwizzle = 0;
+    UINT_32 pipeSwizzle = 0;
+
+    // A zero base256b yields zero for both swizzles without reading pTileInfo.
+    if (base256b != 0)
+    {
+        const UINT_32 numPipes       = HwlGetPipes(pTileInfo);
+        const UINT_32 bankMask       = (1 << QLog2(pTileInfo->banks)) - 1;
+        const UINT_32 pipeMask       = (1 << QLog2(numPipes)) - 1;
+        const UINT_32 groupBytes     = m_pipeInterleaveBytes;
+        const UINT_32 bankInterleave = m_bankInterleave;
+
+        // groupBytes >> 8 rescales the pipe interleave from bytes to 256-byte units.
+        const UINT_32 abovePipeInterleave = base256b / (groupBytes >> 8);
+        pipeSwizzle = abovePipeInterleave & pipeMask;
+        bankSwizzle = (abovePipeInterleave / numPipes / bankInterleave) & bankMask;
+    }
+
+    *pPipeSwizzle = pipeSwizzle;
+    *pBankSwizzle = bankSwizzle;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::GetBankPipeSwizzle
+*   @brief
+*       Combine bank/pipe swizzle and XOR the result into the base address
+*   @return
+*       Base256b bits (only filled bank/pipe bits), i.e. the mixed address shifted right by 8
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::GetBankPipeSwizzle(
+    UINT_32         bankSwizzle,    ///< [in] bank swizzle
+    UINT_32         pipeSwizzle,    ///< [in] pipe swizzle
+    UINT_64         baseAddr,       ///< [in] base address
+    ADDR_TILEINFO*  pTileInfo       ///< [in] tile info
+    ) const
+{
+    // The bank swizzle is placed above the pipe bits and the bank-interleave bits.
+    const UINT_32 pipeBits       = QLog2(HwlGetPipes(pTileInfo));
+    const UINT_32 interleaveBits = QLog2(m_bankInterleave);
+    const UINT_32 bankPart       = (bankSwizzle << interleaveBits) << pipeBits;
+    const UINT_32 tileSwizzle    = pipeSwizzle + bankPart;
+
+    // Scale by the pipe interleave, mix into the base address, then drop the low 8 bits.
+    return static_cast<UINT_32>((baseAddr ^ (tileSwizzle * m_pipeInterleaveBytes)) >> 8);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeSliceTileSwizzle
+*   @brief
+*       Compute cubemap/3d texture faces/slices tile swizzle by rotating the base swizzle
+*   @return
+*       Tile swizzle; 0 when the tile mode is not macro tiled
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeSliceTileSwizzle(
+    AddrTileMode        tileMode,       ///< [in] Tile mode
+    UINT_32             baseSwizzle,    ///< [in] Base swizzle
+    UINT_32             slice,          ///< [in] Slice index, Cubemap face index, 0 means +X
+    UINT_64             baseAddr,       ///< [in] Base address
+    ADDR_TILEINFO* pTileInfo       ///< [in] Bank structure
+    ) const
+{
+    // Swizzle only for macro tile mode
+    if (!IsMacroTiled(tileMode))
+    {
+        return 0;
+    }
+
+    // Slice index expressed in units of the tile mode's thickness.
+    const UINT_32 firstSlice = slice / ComputeSurfaceThickness(tileMode);
+
+    const UINT_32 numPipes = HwlGetPipes(pTileInfo);
+    const UINT_32 numBanks = pTileInfo->banks;
+
+    const UINT_32 pipeRotation = ComputePipeRotation(tileMode, numPipes);
+    const UINT_32 bankRotation = ComputeBankRotation(tileMode, numBanks, numPipes);
+
+    // Start from the swizzle carried by baseSwizzle, or from zero when there is none.
+    UINT_32 bankSwizzle = 0;
+    UINT_32 pipeSwizzle = 0;
+
+    if (baseSwizzle != 0)
+    {
+        ExtractBankPipeSwizzle(baseSwizzle, pTileInfo, &bankSwizzle, &pipeSwizzle);
+    }
+
+    // Add the rotation accumulated over firstSlice slices.
+    if (pipeRotation == 0)
+    {
+        // 2D mode: only the bank swizzle moves.
+        bankSwizzle += firstSlice * bankRotation;
+    }
+    else
+    {
+        // 3D mode: the pipe swizzle moves too, and the bank step is divided by numPipes.
+        pipeSwizzle  = (pipeSwizzle + firstSlice * pipeRotation) % numPipes;
+        bankSwizzle += firstSlice * bankRotation / numPipes;
+    }
+
+    bankSwizzle %= numBanks;
+
+    // Merge the per-slice swizzle with the base address.
+    const UINT_32 tileSwizzle = GetBankPipeSwizzle(bankSwizzle,
+                                                   pipeSwizzle,
+                                                   baseAddr,
+                                                   pTileInfo);
+
+    return tileSwizzle;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeQbStereoRightSwizzle
+*
+*   @brief
+*       Compute right eye swizzle from the bank found at row pInfo->height of the surface
+*   @return
+*       swizzle; 0 unless the surface is macro tiled and has both stereo info and tile info
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlComputeQbStereoRightSwizzle(
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pInfo  ///< [in] Surface info, must be valid
+    ) const
+{
+    UINT_32 swizzle = 0;
+    // The assumption is default swizzle for left eye is 0
+    if (!IsMacroTiled(pInfo->tileMode) || !pInfo->pStereoInfo || !pInfo->pTileInfo)
+    {
+        return swizzle;
+    }
+
+    // Bank of (x = 0, y = pInfo->height, slice 0) with no bank swizzle and no tile split slice.
+    const UINT_32 bankBits = ComputeBankFromCoord(0, pInfo->height, 0,
+                                                  pInfo->tileMode, 0, 0, pInfo->pTileInfo);
+    if (bankBits != 0)
+    {
+        HwlCombineBankPipeSwizzle(bankBits, 0, pInfo->pTileInfo, 0, &swizzle);
+    }
+
+    return swizzle;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeBankFromCoord
+*
+*   @brief
+*       Compute bank number from coordinates, then apply bank swizzle and slice/tile-split rotation
+*   @return
+*       Bank number, masked with (numBanks - 1)
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeBankFromCoord(
+    UINT_32         x,              ///< [in] x coordinate
+    UINT_32         y,              ///< [in] y coordinate
+    UINT_32         slice,          ///< [in] slice index
+    AddrTileMode    tileMode,       ///< [in] tile mode
+    UINT_32         bankSwizzle,    ///< [in] bank swizzle
+    UINT_32         tileSplitSlice, ///< [in] If the size of the pixel offset is larger than the
+                                    ///  tile split size, then the pixel will be moved to a separate
+                                    ///  slice. This value equals pixelOffset / tileSplitBytes
+                                    ///  in this case. Otherwise this is 0.
+    ADDR_TILEINFO*  pTileInfo       ///< [in] tile info
+    ) const
+{
+    UINT_32 pipes = HwlGetPipes(pTileInfo);
+    UINT_32 bankBit0 = 0;
+    UINT_32 bankBit1 = 0;
+    UINT_32 bankBit2 = 0;
+    UINT_32 bankBit3 = 0;
+    UINT_32 sliceRotation;
+    UINT_32 tileSplitRotation;
+    UINT_32 bank;
+    UINT_32 numBanks    = pTileInfo->banks;
+    UINT_32 bankWidth   = pTileInfo->bankWidth;
+    UINT_32 bankHeight  = pTileInfo->bankHeight;
+
+    UINT_32 tx = x / MicroTileWidth / (bankWidth * pipes);
+    UINT_32 ty = y / MicroTileHeight / bankHeight;
+    // Low four bits of tx and ty; these are the only coordinate bits the bank depends on.
+    UINT_32 x3 = _BIT(tx,0);
+    UINT_32 x4 = _BIT(tx,1);
+    UINT_32 x5 = _BIT(tx,2);
+    UINT_32 x6 = _BIT(tx,3);
+    UINT_32 y3 = _BIT(ty,0);
+    UINT_32 y4 = _BIT(ty,1);
+    UINT_32 y5 = _BIT(ty,2);
+    UINT_32 y6 = _BIT(ty,3);
+    // XOR x and y bits into bank bits; the pairing depends on the bank count.
+    switch (numBanks)
+    {
+        case 16:
+            bankBit0 = x3 ^ y6;
+            bankBit1 = x4 ^ y5 ^ y6;
+            bankBit2 = x5 ^ y4;
+            bankBit3 = x6 ^ y3;
+            break;
+        case 8:
+            bankBit0 = x3 ^ y5;
+            bankBit1 = x4 ^ y4 ^ y5;
+            bankBit2 = x5 ^ y3;
+            break;
+        case 4:
+            bankBit0 = x3 ^ y4;
+            bankBit1 = x4 ^ y3;
+            break;
+        case 2:
+            bankBit0 = x3 ^ y3;
+            break;
+        default:
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+
+    bank = bankBit0 | (bankBit1 << 1) | (bankBit2 << 2) | (bankBit3 << 3);
+
+    // Let the hwl hook adjust the bank; it is given x in micro tile units.
+
+    bank = HwlPreAdjustBank((x / MicroTileWidth), bank, pTileInfo);
+    //
+    // Compute bank rotation for the slice.
+    //
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1:  // fall through
+        case ADDR_TM_2D_TILED_THICK:  // fall through
+        case ADDR_TM_2D_TILED_XTHICK:
+            sliceRotation = ((numBanks / 2) - 1) * (slice / microTileThickness);
+            break;
+        case ADDR_TM_3D_TILED_THIN1:  // fall through
+        case ADDR_TM_3D_TILED_THICK:  // fall through
+        case ADDR_TM_3D_TILED_XTHICK:
+            sliceRotation =
+                Max(1u, (pipes / 2) - 1) * (slice / microTileThickness) / pipes;
+            break;
+        default:
+            sliceRotation =  0;
+            break;
+    }
+    // NOTE(review): the PRT tile modes take the default branch above (no slice rotation), while
+    // ComputeBankRotation lists them alongside the 2D/3D modes - confirm this is intended.
+    //
+    // Compute bank rotation for the tile split slice.
+    //
+    // The sample slice will be non-zero if samples must be split across multiple slices.
+    // This situation arises when the micro tile size multiplied by the number of samples exceeds
+    // the split size (set in GB_ADDR_CONFIG).
+    //
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1: //fall through
+        case ADDR_TM_3D_TILED_THIN1: //fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1: //fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1:
+            tileSplitRotation = ((numBanks / 2) + 1) * tileSplitSlice;
+            break;
+        default:
+            tileSplitRotation =  0;
+            break;
+    }
+
+    //
+    // Apply bank rotation for the slice and tile split slice.
+    //
+    bank ^= bankSwizzle + sliceRotation;
+    bank ^= tileSplitRotation;
+
+    bank &= (numBanks - 1);
+
+    return bank;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeBankFromAddr
+*
+*   @brief
+*       Compute the bank number from an address by isolating the address's bank field
+*   @return
+*       Bank number
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeBankFromAddr(
+    UINT_64 addr,       ///< [in] address
+    UINT_32 numBanks,   ///< [in] number of banks
+    UINT_32 numPipes    ///< [in] number of pipes
+    ) const
+{
+    //
+    // The LSBs of the address are arranged as follows:
+    //   bank | bankInterleave | pipe | pipeInterleave
+    //
+    // Shifting right by the combined size of the three lower fields brings the bank field down
+    // to bit 0, where it is masked with (numBanks - 1).
+    //
+
+    // Number of address bits that sit below the bank field.
+    const UINT_32 lowBits = Log2(m_pipeInterleaveBytes * numPipes * m_bankInterleave);
+
+    const UINT_64 shifted  = addr >> lowBits;
+    const UINT_64 bankMask = numBanks - 1;
+
+    return static_cast<UINT_32>(shifted & bankMask);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputePipeRotation
+*
+*   @brief
+*       Compute pipe rotation value; non-zero only for the 3D tile modes tested in the body
+*   @return
+*       Pipe rotation
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputePipeRotation(
+    AddrTileMode tileMode,  ///< [in] tile mode
+    UINT_32      numPipes   ///< [in] number of pipes
+    ) const
+{
+    // Only the 3D macro tile modes (the two PRT 3D variants included) rotate pipes.
+    const BOOL_32 rotatesPipes = (tileMode == ADDR_TM_3D_TILED_THIN1)     ||
+                                 (tileMode == ADDR_TM_3D_TILED_THICK)     ||
+                                 (tileMode == ADDR_TM_3D_TILED_XTHICK)    ||
+                                 (tileMode == ADDR_TM_PRT_3D_TILED_THIN1) ||
+                                 (tileMode == ADDR_TM_PRT_3D_TILED_THICK);
+
+    if (!rotatesPipes)
+    {
+        return 0;
+    }
+
+    // Fewer than 4 pipes rotate by 1; otherwise by half the pipe count minus 1.
+    const UINT_32 halfPipes = numPipes / 2;
+
+    return (numPipes < 4) ? 1 : (halfPipes - 1);
+}
+
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeBankRotation
+*
+*   @brief
+*       Compute bank rotation value; derived from numBanks in 2D modes, from numPipes in 3D modes
+*   @return
+*       Bank rotation
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeBankRotation(
+    AddrTileMode tileMode,  ///< [in] tile mode
+    UINT_32      numBanks,  ///< [in] number of banks
+    UINT_32      numPipes   ///< [in] number of pipes
+    ) const
+{
+    UINT_32 rotation = 0; // modes not listed below do not rotate banks
+
+    switch (tileMode)
+    {
+        case ADDR_TM_2D_TILED_THIN1:     // fall through
+        case ADDR_TM_2D_TILED_THICK:     // fall through
+        case ADDR_TM_2D_TILED_XTHICK:    // fall through
+        case ADDR_TM_PRT_2D_TILED_THIN1: // fall through
+        case ADDR_TM_PRT_2D_TILED_THICK:
+            // Rotate banks per Z-slice by 1 for 4-bank or 3 for 8-bank
+            rotation = (numBanks / 2) - 1;
+            break;
+        case ADDR_TM_3D_TILED_THIN1:     // fall through
+        case ADDR_TM_3D_TILED_THICK:     // fall through
+        case ADDR_TM_3D_TILED_XTHICK:    // fall through
+        case ADDR_TM_PRT_3D_TILED_THIN1: // fall through
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            rotation = (numPipes >= 4) ? ((numPipes / 2) - 1) : 1;    // rotate pipes & banks
+            break;
+        default:
+            break;
+    }
+
+    return rotation;
+}
+
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeHtileBytes
+*
+*   @brief
+*       Compute htile size in bytes, aligned per slice or per surface depending on config flags
+*
+*   @return
+*       Htile size in bytes
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeHtileBytes(
+    UINT_32 pitch,        ///< [in] pitch
+    UINT_32 height,       ///< [in] height
+    UINT_32 bpp,          ///< [in] bits per pixel
+    BOOL_32 isLinear,     ///< [in] if it is linear mode (not used by this implementation)
+    UINT_32 numSlices,    ///< [in] number of slices
+    UINT_64* sliceBytes,  ///< [out] bytes per slice
+    UINT_32 baseAlign     ///< [in] base alignments (not used by this implementation)
+    ) const
+{
+    // Alignment unit: one htile cache line for each pipe.
+    const UINT_64 cacheLineBytes = BITS_TO_BYTES(HtileCacheBits);
+    const UINT_64 alignBytes     = cacheLineBytes * m_pipes;
+
+    // Unaligned size of a single slice.
+    const UINT_64 sliceBits     = static_cast<UINT_64>(pitch) * height * bpp / 64;
+    const UINT_64 rawSliceBytes = BITS_TO_BYTES(sliceBits);
+
+    if (m_configFlags.useHtileSliceAlign)
+    {
+        // Align the sliceSize to htilecachelinesize * pipes at first
+        *sliceBytes = PowTwoAlign(rawSliceBytes, alignBytes);
+
+        return *sliceBytes * numSlices;
+    }
+
+    // Align the surfSize to htilecachelinesize * pipes at last
+    *sliceBytes = rawSliceBytes;
+
+    return PowTwoAlign(rawSliceBytes * numSlices, alignBytes);
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeFmaskInfo
+*
+*   @brief
+*       Compute fmask sizes include padded pitch, height, slices, total size in bytes,
+*       meanwhile output suitable tile mode and alignments as well. Results are returned
+*       through output parameters. The work is done by describing the fmask as a surface.
+*
+*   @return
+*       ADDR_E_RETURNCODE reported by HwlComputeSurfaceInfo
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::DispatchComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,   ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut)  ///< [out] output structure
+{
+    // Setup input structure
+    ADDR_COMPUTE_SURFACE_INFO_INPUT surfIn = {0};
+
+    surfIn.tileMode    = pIn->tileMode;
+    surfIn.width       = pIn->pitch;
+    surfIn.height      = pIn->height;
+    surfIn.numSlices   = pIn->numSlices;
+    surfIn.pTileInfo   = pIn->pTileInfo;
+    surfIn.tileType    = ADDR_NON_DISPLAYABLE;
+    surfIn.flags.fmask = 1;
+
+    // Setup output structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT surfOut = {0};
+
+    surfOut.pTileInfo = pOut->pTileInfo;
+
+    // Setup hwl specific fields
+    HwlFmaskPreThunkSurfInfo(pIn, pOut, &surfIn, &surfOut);
+
+    // The fmask bit count is used as the surface bpp; the call also fills in numSamples.
+    surfIn.bpp = HwlComputeFmaskBits(pIn, &surfIn.numSamples);
+
+    // ComputeSurfaceInfo needs numSamples in surfOut as surface routines need adjusted numSamples
+    surfOut.numSamples = surfIn.numSamples;
+
+    // Run the surface computation on the fmask description.
+    const ADDR_E_RETURNCODE retCode = HwlComputeSurfaceInfo(&surfIn, &surfOut);
+
+    // Save bpp field for surface dump support
+    surfOut.bpp = surfIn.bpp;
+
+    if (retCode != ADDR_OK)
+    {
+        // None of the pOut fields assigned below are written on failure.
+        return retCode;
+    }
+
+    // Copy the computed layout into the fmask output.
+    pOut->bpp         = surfOut.bpp;
+    pOut->pitch       = surfOut.pitch;
+    pOut->height      = surfOut.height;
+    pOut->numSlices   = surfOut.depth;
+    pOut->fmaskBytes  = surfOut.surfSize;
+    pOut->baseAlign   = surfOut.baseAlign;
+    pOut->pitchAlign  = surfOut.pitchAlign;
+    pOut->heightAlign = surfOut.heightAlign;
+
+    // For fmask, expNumSlices is stored in depth.
+    pOut->sliceSize = (surfOut.depth > 1) ? (surfOut.surfSize / surfOut.depth)
+                                          : surfOut.surfSize;
+
+    // Save numSamples field for surface dump support
+    pOut->numSamples = surfOut.numSamples;
+
+    // Let the hwl hook finish the output from the surface result.
+    HwlFmaskPostThunkSurfInfo(&surfOut, pOut);
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeFmaskInfo
+*   @brief
+*       Entry of EgBasedAddrLib ComputeFmaskInfo; supplies a temporary tile info when needed
+*   @return
+*       ADDR_E_RETURNCODE reported by DispatchComputeFmaskInfo
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeFmaskInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pIn,   ///< [in] input structure
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT*         pOut   ///< [out] output structure
+    )
+{
+    ADDR_TILEINFO localTileInfo = {0};
+
+    // Use internal tile info if pOut does not have a valid pTileInfo
+    if (pOut->pTileInfo == NULL)
+    {
+        pOut->pTileInfo = &localTileInfo;
+    }
+
+    const ADDR_E_RETURNCODE retCode = DispatchComputeFmaskInfo(pIn, pOut);
+
+    if (retCode == ADDR_OK)
+    {
+        // The hwl hook may replace the tile index after seeing the resulting tile info.
+        pOut->tileIndex = HwlPostCheckTileIndex(pOut->pTileInfo,
+                                                pIn->tileMode,
+                                                ADDR_NON_DISPLAYABLE,
+                                                pOut->tileIndex);
+    }
+
+    // Resets pTileInfo to NULL if the internal tile info is used
+    if (pOut->pTileInfo == &localTileInfo)
+    {
+        pOut->pTileInfo = NULL; // never hand a pointer to a local back to the caller
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeFmaskAddrFromCoord
+*   @brief
+*       Entry of EgBasedAddrLib ComputeFmaskAddrFromCoord (a no-op unless ADDR_AM_BUILD)
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS when the ADDR_AM_BUILD range checks fail
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeFmaskAddrFromCoord(
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+#if ADDR_AM_BUILD
+    // x == pitch and y == height pass the check; only larger values are rejected.
+    const BOOL_32 coordOk  = (pIn->x <= pIn->pitch) && (pIn->y <= pIn->height);
+    const BOOL_32 sampleOk = (pIn->numSamples <= m_maxSamples) && (pIn->sample < m_maxSamples);
+    if (coordOk && sampleOk)
+    {
+        pOut->addr = DispatchComputeFmaskAddrFromCoord(pIn, pOut);
+    }
+    else
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+#endif
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeFmaskCoordFromAddr
+*   @brief
+*       Entry of EgBasedAddrLib ComputeFmaskCoordFromAddr (a no-op unless ADDR_AM_BUILD)
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS when the ADDR_AM_BUILD range checks fail
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeFmaskCoordFromAddr(
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+#if ADDR_AM_BUILD
+    // bitPosition must be below 8 and numSamples must not exceed m_maxSamples.
+    if ((pIn->bitPosition < 8) && (pIn->numSamples <= m_maxSamples))
+    {
+        DispatchComputeFmaskCoordFromAddr(pIn, pOut);
+    }
+    else
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+#endif
+
+    return retCode;
+}
+
+#if ADDR_AM_BUILD
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeFmaskAddrFromCoord
+*
+*   @brief
+*       Computes the FMASK address and bit position from a coordinate, dispatching on tile mode.
+*   @return
+*       The byte address; 0 (with bit position 0) for tile modes that are not handled
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::DispatchComputeFmaskAddrFromCoord(
+    const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    UINT_32             x                 = pIn->x;
+    UINT_32             y                 = pIn->y;
+    UINT_32             slice             = pIn->slice;
+    UINT_32             sample            = pIn->sample;
+    UINT_32             plane             = pIn->plane;
+    UINT_32             pitch             = pIn->pitch;
+    UINT_32             height            = pIn->height;
+    UINT_32             numSamples        = pIn->numSamples;
+    AddrTileMode        tileMode          = pIn->tileMode;
+    BOOL_32             ignoreSE          = pIn->ignoreSE;
+    ADDR_TILEINFO*      pTileInfo         = pIn->pTileInfo;
+    BOOL_32             resolved          = pIn->resolved;
+
+    UINT_32* pBitPosition = &pOut->bitPosition; // the helpers write the bit position here
+    UINT_64 addr          = 0;                  // stays 0 when the tile mode is not handled
+
+    ADDR_ASSERT(numSamples > 1);
+    ADDR_ASSERT(ComputeSurfaceThickness(tileMode) == 1);
+
+    switch (tileMode)
+    {
+        case ADDR_TM_1D_TILED_THIN1: // micro tiled: no swizzle or tile info involved
+            addr = ComputeFmaskAddrFromCoordMicroTiled(x,
+                                                       y,
+                                                       slice,
+                                                       sample,
+                                                       plane,
+                                                       pitch,
+                                                       height,
+                                                       numSamples,
+                                                       tileMode,
+                                                       resolved,
+                                                       pBitPosition);
+            break;
+        case ADDR_TM_2D_TILED_THIN1: //fall through
+        case ADDR_TM_3D_TILED_THIN1:
+            UINT_32 pipeSwizzle; // both are assigned in either branch of the if below
+            UINT_32 bankSwizzle;
+
+            // With useCombinedSwizzle the two swizzles are unpacked from pIn->tileSwizzle;
+            // otherwise they are taken from their own input fields.
+            if (m_configFlags.useCombinedSwizzle)
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+            else
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+
+            addr = ComputeFmaskAddrFromCoordMacroTiled(x,
+                                                       y,
+                                                       slice,
+                                                       sample,
+                                                       plane,
+                                                       pitch,
+                                                       height,
+                                                       numSamples,
+                                                       tileMode,
+                                                       pipeSwizzle,
+                                                       bankSwizzle,
+                                                       ignoreSE,
+                                                       pTileInfo,
+                                                       resolved,
+                                                       pBitPosition);
+            break;
+        default: // any other tile mode: address 0 and bit position 0
+            *pBitPosition = 0;
+            break;
+    }
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskAddrFromCoordMicroTiled
+*
+*   @brief
+*       Computes the FMASK address and bit position from a coordinate for 1D tiled (micro
+*       tiled) surfaces
+*   @return
+*       The byte address; the bit position within that byte is stored to *pBitPosition
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeFmaskAddrFromCoordMicroTiled(
+    UINT_32             x,              ///< [in] x coordinate
+    UINT_32             y,              ///< [in] y coordinate
+    UINT_32             slice,          ///< [in] slice index
+    UINT_32             sample,         ///< [in] sample number
+    UINT_32             plane,          ///< [in] plane number
+    UINT_32             pitch,          ///< [in] surface pitch in pixels
+    UINT_32             height,         ///< [in] surface height in pixels
+    UINT_32             numSamples,     ///< [in] number of samples
+    AddrTileMode        tileMode,       ///< [in] tile mode
+    BOOL_32             resolved,       ///< [in] TRUE if this is for resolved fmask
+    UINT_32*            pBitPosition    ///< [out] pointer to returned bit position
+    ) const
+{
+    UINT_64 addr = 0;
+    UINT_32 effectiveBpp;
+    UINT_32 effectiveSamples;
+
+    //
+    // 2xAA uses the same layout as 4xAA
+    //
+    if (numSamples == 2)
+    {
+        numSamples = 4;
+    }
+
+    //
+    // Unresolved FMASK is addressed per plane; resolved FMASK is addressed per sample.
+    //
+    if (!resolved)
+    {
+        effectiveSamples = ComputeFmaskNumPlanesFromNumSamples(numSamples);
+        effectiveBpp = numSamples;
+
+        //
+        // Compute the address just like a color surface with numSamples bits per element and
+        // numPlanes samples. The plane index is passed in the sample slot.
+        //
+        addr = ComputeSurfaceAddrFromCoordMicroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     plane, // sample
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,
+                                                     FALSE,
+                                                     pBitPosition);
+
+        //
+        // Compute the real bit position. Each (sample, plane) is stored with one bit per sample.
+        //
+        // Whatever the call above stored through pBitPosition is overwritten below.
+        //
+        // Compute the pixel index within the micro tile
+        //
+        UINT_32 pixelIndex = ComputePixelIndexWithinMicroTile(x % 8,
+                                                              y % 8,
+                                                              slice,
+                                                              1,
+                                                              tileMode,
+                                                              ADDR_NON_DISPLAYABLE);
+
+        *pBitPosition = ((pixelIndex * numSamples) + sample) & (BITS_PER_BYTE-1);
+
+        UINT_64 bitAddr = BYTES_TO_BITS(addr) + *pBitPosition;
+
+        addr = bitAddr / BITS_PER_BYTE;
+    }
+    else
+    {
+        effectiveBpp = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+        effectiveSamples = 1;
+
+        //
+        // Resolved FMASK: compute the address just like a single-sample color surface with
+        // effectiveBpp bits per element; the sample index is passed straight through.
+        //
+        addr = ComputeSurfaceAddrFromCoordMicroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     sample,
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,
+                                                     TRUE,
+                                                     pBitPosition);
+    }
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskAddrFromCoordMacroTiled
+*
+*   @brief
+*       Computes the FMASK address and bit position from a coordinate for 2D tiled (macro
+*       tiled) surfaces
+*   @return
+*       The byte address; the bit position within that byte is stored to *pBitPosition
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::ComputeFmaskAddrFromCoordMacroTiled(
+    UINT_32             x,              ///< [in] x coordinate
+    UINT_32             y,              ///< [in] y coordinate
+    UINT_32             slice,          ///< [in] slice index
+    UINT_32             sample,         ///< [in] sample number
+    UINT_32             plane,          ///< [in] plane number
+    UINT_32             pitch,          ///< [in] surface pitch in pixels
+    UINT_32             height,         ///< [in] surface height in pixels
+    UINT_32             numSamples,     ///< [in] number of samples
+    AddrTileMode        tileMode,       ///< [in] tile mode
+    UINT_32             pipeSwizzle,    ///< [in] pipe swizzle
+    UINT_32             bankSwizzle,    ///< [in] bank swizzle
+    BOOL_32             ignoreSE,       ///< [in] TRUE if ignore shader engine
+    ADDR_TILEINFO*      pTileInfo,      ///< [in] bank structure.**All fields to be valid on entry**
+    BOOL_32             resolved,       ///< [in] TRUE if this is for resolved fmask
+    UINT_32*            pBitPosition    ///< [out] pointer to returned bit position
+    ) const
+{
+    UINT_64 addr = 0;
+    UINT_32 effectiveBpp;
+    UINT_32 effectiveSamples;
+
+    //
+    // 2xAA uses the same layout as 4xAA
+    //
+    if (numSamples == 2)
+    {
+        numSamples = 4;
+    }
+
+    //
+    // Unresolved FMASK is addressed per plane; resolved FMASK is addressed per sample.
+    //
+    if (!resolved)
+    {
+        effectiveSamples = ComputeFmaskNumPlanesFromNumSamples(numSamples);
+        effectiveBpp = numSamples;
+
+        //
+        // Compute the address just like a color surface with numSamples bits per element and
+        // numPlanes samples. The plane index is passed in the sample slot.
+        //
+        addr = ComputeSurfaceAddrFromCoordMacroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     plane, // sample
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,// isdisp
+                                                     ignoreSE,// ignore_shader
+                                                     FALSE,// depth_sample_order
+                                                     pipeSwizzle,
+                                                     bankSwizzle,
+                                                     pTileInfo,
+                                                     pBitPosition);
+
+        //
+        // Compute the real bit position. Each (sample, plane) is stored with one bit per sample.
+        // Whatever the call above stored through pBitPosition is overwritten below.
+        //
+        // Compute the pixel index within the micro tile.
+        // NOTE(review): the micro-tiled variant passes (x % 8, y % 8, slice, 1) here, while this
+        // one passes (x, y, slice, effectiveBpp) - confirm the difference is intentional.
+        //
+        UINT_32 pixelIndex = ComputePixelIndexWithinMicroTile(x ,
+                                                              y ,
+                                                              slice,
+                                                              effectiveBpp,
+                                                              tileMode,
+                                                              ADDR_NON_DISPLAYABLE);
+
+        *pBitPosition = ((pixelIndex * numSamples) + sample) & (BITS_PER_BYTE-1);
+
+        UINT_64 bitAddr = BYTES_TO_BITS(addr) + *pBitPosition;
+
+        addr = bitAddr / 8; // convert the bit address back to a byte address
+
+    }
+    else
+    {
+        effectiveBpp = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+        effectiveSamples = 1;
+
+        //
+        // Resolved FMASK: compute the address just like a single-sample color surface with
+        // effectiveBpp bits per element; the sample index is passed straight through.
+        //
+        addr = ComputeSurfaceAddrFromCoordMacroTiled(x,
+                                                     y,
+                                                     slice,
+                                                     sample,
+                                                     effectiveBpp,
+                                                     pitch,
+                                                     height,
+                                                     effectiveSamples,
+                                                     tileMode,
+                                                     ADDR_NON_DISPLAYABLE,
+                                                     ignoreSE,
+                                                     TRUE,
+                                                     pipeSwizzle,
+                                                     bankSwizzle,
+                                                     pTileInfo,
+                                                     pBitPosition);
+    }
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskCoordFromAddrMicroTiled
+*
+*   @brief
+*       Recover (x,y,slice,sample,plane) coordinates from a micro tiled (1D) fmask address
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeFmaskCoordFromAddrMicroTiled(
+    UINT_64             addr,       ///< [in] byte address
+    UINT_32             bitPosition,///< [in] bit position
+    UINT_32             pitch,      ///< [in] pitch in pixels
+    UINT_32             height,     ///< [in] height in pixels
+    UINT_32             numSamples, ///< [in] number of samples (of color buffer)
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    BOOL_32             resolved,   ///< [in] TRUE if it is resolved fmask
+    UINT_32*            pX,         ///< [out] X coord
+    UINT_32*            pY,         ///< [out] Y coord
+    UINT_32*            pSlice,     ///< [out] slice index
+    UINT_32*            pSample,    ///< [out] sample index
+    UINT_32*            pPlane      ///< [out] plane index
+    ) const
+{
+    //
+    // 2xAA use the same layout as 4xAA, so decode 2-sample fmask with 4 samples
+    //
+    const UINT_32 layoutSamples = (numSamples == 2) ? 4 : numSamples;
+
+    UINT_32  surfBpp;           // bits per element passed to the surface decoder
+    UINT_32  surfSamples;       // number of samples passed to the surface decoder
+    UINT_32* pSampleSlot;       // receives what the decoder returns in its sample slot
+    BOOL_32  depthSampleOrder;  // isDepthSampleOrder flag passed to the surface decoder
+
+    //
+    // Unresolved fmask decodes as a surface with one bit per sample in each element and one
+    // surface sample per plane, so the decoder's sample slot receives the plane index.
+    // Resolved fmask decodes as a single-sample surface with isDepthSampleOrder set, so the
+    // decoder's sample slot receives the sample index itself.
+    //
+    if (resolved)
+    {
+        surfBpp          = ComputeFmaskResolvedBppFromNumSamples(layoutSamples);
+        surfSamples      = 1;
+        pSampleSlot      = pSample;
+        depthSampleOrder = TRUE;
+    }
+    else
+    {
+        surfSamples      = ComputeFmaskNumPlanesFromNumSamples(layoutSamples);
+        surfBpp          = layoutSamples;
+        pSampleSlot      = pPlane;
+        depthSampleOrder = FALSE;
+    }
+
+    //
+    // Single decode call; only the four values chosen above differ between the two cases
+    //
+    ComputeSurfaceCoordFromAddrMicroTiled(addr,
+                                          bitPosition,
+                                          surfBpp,
+                                          pitch,
+                                          height,
+                                          surfSamples,
+                                          tileMode,
+                                          0, // tileBase
+                                          0, // compBits
+                                          pX,
+                                          pY,
+                                          pSlice,
+                                          pSampleSlot,
+                                          ADDR_NON_DISPLAYABLE, // microTileType
+                                          depthSampleOrder      // isDepthSampleOrder
+                                          );
+
+    //
+    // For unresolved fmask the sample index is taken from the bit position instead; pSample
+    // is optional in that case.
+    //
+    if (!resolved && (pSample != NULL))
+    {
+        *pSample = bitPosition % layoutSamples;
+    }
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskCoordFromAddrMacroTiled
+*
+*   @brief
+*       Recover (x,y,slice,sample,plane) coordinates from a macro tiled (2D)
+*       fmask address
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::ComputeFmaskCoordFromAddrMacroTiled(
+    UINT_64             addr,       ///< [in] byte address
+    UINT_32             bitPosition,///< [in] bit position
+    UINT_32             pitch,      ///< [in] pitch in pixels
+    UINT_32             height,     ///< [in] height in pixels
+    UINT_32             numSamples, ///< [in] number of samples (of color buffer)
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    UINT_32             pipeSwizzle,///< [in] pipe swizzle
+    UINT_32             bankSwizzle,///< [in] bank swizzle
+    BOOL_32             ignoreSE,   ///< [in] TRUE if ignore shader engine
+    ADDR_TILEINFO*      pTileInfo,  ///< [in] bank structure. **All fields to be valid on entry**
+    BOOL_32             resolved,   ///< [in] TRUE if it is resolved fmask
+    UINT_32*            pX,         ///< [out] X coord
+    UINT_32*            pY,         ///< [out] Y coord
+    UINT_32*            pSlice,     ///< [out] slice index
+    UINT_32*            pSample,    ///< [out] sample index
+    UINT_32*            pPlane      ///< [out] plane index
+    ) const
+{
+    //
+    // 2xAA use the same layout as 4xAA, so decode 2-sample fmask with 4 samples
+    //
+    const UINT_32 layoutSamples = (numSamples == 2) ? 4 : numSamples;
+
+    //
+    // Decoder inputs that depend on whether the fmask is resolved
+    //
+    UINT_32  surfBpp;
+    UINT_32  surfSamples;
+    UINT_32* pSampleSlot;
+    BOOL_32  depthSampleOrder; // presumably isDepthSampleOrder, as in the micro tiled decoder
+
+    if (resolved)
+    {
+        //
+        // Resolved: single-sample surface with the flag set; the decoder's last output
+        // receives the sample index
+        //
+        surfBpp          = ComputeFmaskResolvedBppFromNumSamples(layoutSamples);
+        surfSamples      = 1;
+        pSampleSlot      = pSample;
+        depthSampleOrder = TRUE;
+    }
+    else
+    {
+        //
+        // Unresolved: one bit per sample in each element and one surface sample per plane;
+        // the decoder's last output receives the plane index
+        //
+        surfSamples      = ComputeFmaskNumPlanesFromNumSamples(layoutSamples);
+        surfBpp          = layoutSamples;
+        pSampleSlot      = pPlane;
+        depthSampleOrder = FALSE;
+    }
+
+    //
+    // Single decode call; only the four values chosen above differ between the two cases
+    //
+    ComputeSurfaceCoordFromAddrMacroTiled(addr,
+                                          bitPosition,
+                                          surfBpp,
+                                          pitch,
+                                          height,
+                                          surfSamples,
+                                          tileMode,
+                                          0, // No tileBase
+                                          0, // No compBits
+                                          ADDR_NON_DISPLAYABLE,
+                                          ignoreSE,
+                                          depthSampleOrder,
+                                          pipeSwizzle,
+                                          bankSwizzle,
+                                          pTileInfo,
+                                          pX,
+                                          pY,
+                                          pSlice,
+                                          pSampleSlot);
+
+    //
+    // Unresolved fmask takes its sample index from the bit position; pSample is optional
+    // in that case
+    //
+    if (!resolved && (pSample != NULL))
+    {
+        *pSample = bitPosition % layoutSamples;
+    }
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::DispatchComputeFmaskCoordFromAddr
+*
+*   @brief
+*       Select the micro or macro tiled fmask decoder from the input tile mode and use it
+*       to compute (x,y,slice,sample,plane) coordinates from an fmask address
+*   @return
+*       N/A
+*
+***************************************************************************************************
+*/
+VOID EgBasedAddrLib::DispatchComputeFmaskCoordFromAddr(
+    const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT*   pIn,    ///< [in] input structure
+    ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT*        pOut    ///< [out] output structure
+    ) const
+{
+    //
+    // Output slots filled in by the tile mode specific helpers
+    //
+    UINT_32* const pOutX      = &pOut->x;
+    UINT_32* const pOutY      = &pOut->y;
+    UINT_32* const pOutSlice  = &pOut->slice;
+    UINT_32* const pOutSample = &pOut->sample;
+    UINT_32* const pOutPlane  = &pOut->plane;
+
+    switch (pIn->tileMode)
+    {
+        case ADDR_TM_1D_TILED_THIN1:
+        {
+            ComputeFmaskCoordFromAddrMicroTiled(pIn->addr,
+                                                pIn->bitPosition,
+                                                pIn->pitch,
+                                                pIn->height,
+                                                pIn->numSamples,
+                                                pIn->tileMode,
+                                                pIn->resolved,
+                                                pOutX,
+                                                pOutY,
+                                                pOutSlice,
+                                                pOutSample,
+                                                pOutPlane);
+            break;
+        }
+        case ADDR_TM_2D_TILED_THIN1: // fall through
+        case ADDR_TM_3D_TILED_THIN1:
+        {
+            UINT_32 pipeSwizzle;
+            UINT_32 bankSwizzle;
+
+            //
+            // Swizzles arrive either as two separate fields or combined in tileSwizzle
+            //
+            if (!m_configFlags.useCombinedSwizzle)
+            {
+                pipeSwizzle = pIn->pipeSwizzle;
+                bankSwizzle = pIn->bankSwizzle;
+            }
+            else
+            {
+                ExtractBankPipeSwizzle(pIn->tileSwizzle, pIn->pTileInfo,
+                                       &bankSwizzle, &pipeSwizzle);
+            }
+
+            ComputeFmaskCoordFromAddrMacroTiled(pIn->addr,
+                                                pIn->bitPosition,
+                                                pIn->pitch,
+                                                pIn->height,
+                                                pIn->numSamples,
+                                                pIn->tileMode,
+                                                pipeSwizzle,
+                                                bankSwizzle,
+                                                pIn->ignoreSE,
+                                                pIn->pTileInfo,
+                                                pIn->resolved,
+                                                pOutX,
+                                                pOutY,
+                                                pOutSlice,
+                                                pOutSample,
+                                                pOutPlane);
+            break;
+        }
+        default:
+            // No decoder for other tile modes: only the assert macro runs, pOut is not written
+            ADDR_ASSERT_ALWAYS();
+            break;
+    }
+}
+#endif
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskNumPlanesFromNumSamples
+*
+*   @brief
+*       Map a sample count (2, 4 or 8) to the number of fmask planes
+*
+*   @return
+*       Number of planes; 0 for any other sample count
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeFmaskNumPlanesFromNumSamples(
+    UINT_32 numSamples)     ///< [in] number of samples
+{
+    //
+    // FMASK micro tiles hold elements of N bits each, N being the number of samples.  One
+    // micro tile exists per bit of the FMASK address; these per-bit tiles (also called planes)
+    // are stored sequentially.  Hence 2-sample FMASK looks like a general surface with 2 bits
+    // per element, 4-sample FMASK like one with 4 bits per element and 2 samples, and
+    // 8-sample FMASK like one with 8 bits per element and 4 samples.  R6xx and R7xx only
+    // stored 3 planes for 8-sample FMASK surfaces; R8xx changed this to simplify the logic in
+    // the CB.
+    //
+    UINT_32 planeCount = 0;
+
+    if (numSamples == 2)
+    {
+        planeCount = 1;
+    }
+    else if (numSamples == 4)
+    {
+        planeCount = 2;
+    }
+    else if (numSamples == 8)
+    {
+        planeCount = 4;
+    }
+    else
+    {
+        ADDR_UNHANDLED_CASE();
+    }
+
+    return planeCount;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::ComputeFmaskResolvedBppFromNumSamples
+*
+*   @brief
+*       Map a sample count (2, 4 or 8) to the effective bpp of a resolved fmask
+*
+*   @return
+*       bpp; 0 for any other sample count
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::ComputeFmaskResolvedBppFromNumSamples(
+    UINT_32 numSamples)     ///< number of samples
+{
+    //
+    // Resolved FMASK surfaces are generated by the CB and read by the texture unit so that
+    // the texture unit can read compressed multi-sample color data.
+    //
+    // These surfaces store each index value packed per element, and each element contains at
+    // least num_samples * log2(num_samples) bits.
+    //
+    // Resolved FMASK surfaces are addressed similarly to a single-sample color surface with:
+    //     2-sample:  8 bits per element
+    //     4-sample:  8 bits per element
+    //     8-sample: 32 bits per element
+    //
+    UINT_32 resolvedBpp = 0;
+
+    switch (numSamples)
+    {
+        case 2: // shares the 4-sample element size
+        case 4:
+            resolvedBpp = 8;
+            break;
+        case 8:
+            resolvedBpp = 32;
+            break;
+        default:
+            ADDR_UNHANDLED_CASE();
+            break;
+    }
+
+    return resolvedBpp;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::IsTileInfoAllZero
+*
+*   @brief
+*       Return TRUE if all fields of the tile info are zero
+*   @note
+*       A NULL input is considered to be all zero
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::IsTileInfoAllZero(
+    ADDR_TILEINFO* pTileInfo)
+{
+    //
+    // Nothing to inspect for a NULL pointer; it counts as all zero
+    //
+    if (pTileInfo == NULL)
+    {
+        return TRUE;
+    }
+
+    // Any single non-zero field below makes the answer FALSE
+    const BOOL_32 anyFieldSet = (pTileInfo->banks            != 0) ||
+                                (pTileInfo->bankWidth        != 0) ||
+                                (pTileInfo->bankHeight       != 0) ||
+                                (pTileInfo->macroAspectRatio != 0) ||
+                                (pTileInfo->tileSplitBytes   != 0) ||
+                                (pTileInfo->pipeConfig       != 0);
+
+    return anyFieldSet ? FALSE : TRUE;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlTileInfoEqual
+*
+*   @brief
+*       Return TRUE if all compared fields of the two tile infos are equal
+*   @note
+*       Only takes care of current HWL's data
+***************************************************************************************************
+*/
+BOOL_32 EgBasedAddrLib::HwlTileInfoEqual(
+    const ADDR_TILEINFO* pLeft, ///<[in] Left compare operand
+    const ADDR_TILEINFO* pRight ///<[in] Right compare operand
+    ) const
+{
+    // Equal only when none of the fields compared below differ.
+    // pipeConfig is not part of this comparison.
+    if ((pLeft->banks            != pRight->banks)            ||
+        (pLeft->bankWidth        != pRight->bankWidth)        ||
+        (pLeft->bankHeight       != pRight->bankHeight)       ||
+        (pLeft->macroAspectRatio != pRight->macroAspectRatio) ||
+        (pLeft->tileSplitBytes   != pRight->tileSplitBytes))
+    {
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlConvertTileInfoToHW
+*   @brief
+*       Entry of EgBasedAddrLib ConvertTileInfoToHW
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlConvertTileInfoToHW(
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode   = ADDR_OK;
+
+    ADDR_TILEINFO *pTileInfoIn  = pIn->pTileInfo;
+    ADDR_TILEINFO *pTileInfoOut = pOut->pTileInfo;
+
+    if ((pTileInfoIn != NULL) && (pTileInfoOut != NULL))
+    {
+        if (pIn->reverse == FALSE)
+        {
+            switch (pTileInfoIn->banks)
+            {
+                case 2:
+                    pTileInfoOut->banks = 0;
+                    break;
+                case 4:
+                    pTileInfoOut->banks = 1;
+                    break;
+                case 8:
+                    pTileInfoOut->banks = 2;
+                    break;
+                case 16:
+                    pTileInfoOut->banks = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->banks = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankWidth)
+            {
+                case 1:
+                    pTileInfoOut->bankWidth = 0;
+                    break;
+                case 2:
+                    pTileInfoOut->bankWidth = 1;
+                    break;
+                case 4:
+                    pTileInfoOut->bankWidth = 2;
+                    break;
+                case 8:
+                    pTileInfoOut->bankWidth = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankWidth = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankHeight)
+            {
+                case 1:
+                    pTileInfoOut->bankHeight = 0;
+                    break;
+                case 2:
+                    pTileInfoOut->bankHeight = 1;
+                    break;
+                case 4:
+                    pTileInfoOut->bankHeight = 2;
+                    break;
+                case 8:
+                    pTileInfoOut->bankHeight = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankHeight = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->macroAspectRatio)
+            {
+                case 1:
+                    pTileInfoOut->macroAspectRatio = 0;
+                    break;
+                case 2:
+                    pTileInfoOut->macroAspectRatio = 1;
+                    break;
+                case 4:
+                    pTileInfoOut->macroAspectRatio = 2;
+                    break;
+                case 8:
+                    pTileInfoOut->macroAspectRatio = 3;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->macroAspectRatio = 0;
+                    break;
+            }
+
+            switch (pTileInfoIn->tileSplitBytes)
+            {
+                case 64:
+                    pTileInfoOut->tileSplitBytes = 0;
+                    break;
+                case 128:
+                    pTileInfoOut->tileSplitBytes = 1;
+                    break;
+                case 256:
+                    pTileInfoOut->tileSplitBytes = 2;
+                    break;
+                case 512:
+                    pTileInfoOut->tileSplitBytes = 3;
+                    break;
+                case 1024:
+                    pTileInfoOut->tileSplitBytes = 4;
+                    break;
+                case 2048:
+                    pTileInfoOut->tileSplitBytes = 5;
+                    break;
+                case 4096:
+                    pTileInfoOut->tileSplitBytes = 6;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->tileSplitBytes = 0;
+                    break;
+            }
+        }
+        else
+        {
+            switch (pTileInfoIn->banks)
+            {
+                case 0:
+                    pTileInfoOut->banks = 2;
+                    break;
+                case 1:
+                    pTileInfoOut->banks = 4;
+                    break;
+                case 2:
+                    pTileInfoOut->banks = 8;
+                    break;
+                case 3:
+                    pTileInfoOut->banks = 16;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->banks = 2;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankWidth)
+            {
+                case 0:
+                    pTileInfoOut->bankWidth = 1;
+                    break;
+                case 1:
+                    pTileInfoOut->bankWidth = 2;
+                    break;
+                case 2:
+                    pTileInfoOut->bankWidth = 4;
+                    break;
+                case 3:
+                    pTileInfoOut->bankWidth = 8;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankWidth = 1;
+                    break;
+            }
+
+            switch (pTileInfoIn->bankHeight)
+            {
+                case 0:
+                    pTileInfoOut->bankHeight = 1;
+                    break;
+                case 1:
+                    pTileInfoOut->bankHeight = 2;
+                    break;
+                case 2:
+                    pTileInfoOut->bankHeight = 4;
+                    break;
+                case 3:
+                    pTileInfoOut->bankHeight = 8;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->bankHeight = 1;
+                    break;
+            }
+
+            switch (pTileInfoIn->macroAspectRatio)
+            {
+                case 0:
+                    pTileInfoOut->macroAspectRatio = 1;
+                    break;
+                case 1:
+                    pTileInfoOut->macroAspectRatio = 2;
+                    break;
+                case 2:
+                    pTileInfoOut->macroAspectRatio = 4;
+                    break;
+                case 3:
+                    pTileInfoOut->macroAspectRatio = 8;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->macroAspectRatio = 1;
+                    break;
+            }
+
+            switch (pTileInfoIn->tileSplitBytes)
+            {
+                case 0:
+                    pTileInfoOut->tileSplitBytes = 64;
+                    break;
+                case 1:
+                    pTileInfoOut->tileSplitBytes = 128;
+                    break;
+                case 2:
+                    pTileInfoOut->tileSplitBytes = 256;
+                    break;
+                case 3:
+                    pTileInfoOut->tileSplitBytes = 512;
+                    break;
+                case 4:
+                    pTileInfoOut->tileSplitBytes = 1024;
+                    break;
+                case 5:
+                    pTileInfoOut->tileSplitBytes = 2048;
+                    break;
+                case 6:
+                    pTileInfoOut->tileSplitBytes = 4096;
+                    break;
+                default:
+                    ADDR_ASSERT_ALWAYS();
+                    retCode = ADDR_INVALIDPARAMS;
+                    pTileInfoOut->tileSplitBytes = 64;
+                    break;
+            }
+        }
+
+        if (pTileInfoIn != pTileInfoOut)
+        {
+            pTileInfoOut->pipeConfig = pTileInfoIn->pipeConfig;
+        }
+    }
+    else
+    {
+        ADDR_ASSERT_ALWAYS();
+        retCode = ADDR_INVALIDPARAMS;
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSurfaceInfo
+*   @brief
+*       Entry of EgBasedAddrLib ComputeSurfaceInfo; also fills pOut->tileIndex / macroModeIndex
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS if numSamples < numFrags or the dispatch reports failure
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    ADDR_E_RETURNCODE retCode = ADDR_OK;
+
+    if (pIn->numSamples < pIn->numFrags) // fewer samples than fragments is rejected
+    {
+        retCode = ADDR_INVALIDPARAMS;
+    }
+
+    ADDR_TILEINFO tileInfo = {0}; // zeroed local stand-in, lent to pOut when it has no pTileInfo
+
+    if (retCode == ADDR_OK)
+    {
+        // Uses internal tile info if pOut does not have a valid pTileInfo
+        if (pOut->pTileInfo == NULL)
+        {
+            pOut->pTileInfo = &tileInfo;
+        }
+
+        if (!DispatchComputeSurfaceInfo(pIn, pOut))
+        {
+            retCode = ADDR_INVALIDPARAMS; // the index computations below still run in this case
+        }
+
+        // Returns an index
+        pOut->tileIndex = HwlPostCheckTileIndex(pOut->pTileInfo,
+                                                pOut->tileMode,
+                                                pOut->tileType,
+                                                pOut->tileIndex);
+
+        // Only macro tiled modes that do not have a macro mode index yet get one computed here
+        if (IsMacroTiled(pOut->tileMode) && (pOut->macroModeIndex == TileIndexInvalid))
+        {
+            pOut->macroModeIndex = HwlComputeMacroModeIndex(pOut->tileIndex,
+                                                            pIn->flags,
+                                                            pIn->bpp,
+                                                            pIn->numSamples,
+                                                            pOut->pTileInfo);
+        }
+
+        // Resets pTileInfo to NULL if the internal tile info is used, so that the address of
+        // this function's local tileInfo is not left in pOut after returning
+        if (pOut->pTileInfo == &tileInfo)
+        {
+#if DEBUG
+            // Client does not pass in a valid pTileInfo
+            if (IsMacroTiled(pOut->tileMode))
+            {
+                // If a valid index is returned, then no pTileInfo is okay
+                ADDR_ASSERT(!m_configFlags.useTileIndex || pOut->tileIndex != TileIndexInvalid);
+
+                if (!IsTileInfoAllZero(pIn->pTileInfo))
+                {
+                    // The initial value of pIn->pTileInfo is copied to tileInfo (not done in this
+                    // function). We expect none of these values to change and no input to be 0
+                    ADDR_ASSERT(tileInfo.banks == pIn->pTileInfo->banks);
+                    ADDR_ASSERT(tileInfo.bankWidth == pIn->pTileInfo->bankWidth);
+                    ADDR_ASSERT(tileInfo.bankHeight == pIn->pTileInfo->bankHeight);
+                    ADDR_ASSERT(tileInfo.macroAspectRatio == pIn->pTileInfo->macroAspectRatio);
+                    ADDR_ASSERT(tileInfo.tileSplitBytes == pIn->pTileInfo->tileSplitBytes);
+                }
+            }
+#endif
+            pOut->pTileInfo = NULL;
+        }
+    }
+
+    return retCode;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSurfaceAddrFromCoord
+*   @brief
+*       Validate the input, then compute the surface address of a coordinate into pOut->addr
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS when a coordinate or the sample count is rejected
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSurfaceAddrFromCoord(
+    const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    // The sample count may never exceed m_maxSamples
+    BOOL_32 rejected = (pIn->numSamples > m_maxSamples);
+
+#if !ALT_TEST // Overflow test needs this out-of-boundary coord
+    // Coordinates strictly greater than pitch / height are refused
+    rejected = rejected || (pIn->x > pIn->pitch) || (pIn->y > pIn->height);
+#endif
+
+    if (rejected)
+    {
+        // pOut->addr is not written on this path
+        return ADDR_INVALIDPARAMS;
+    }
+
+    pOut->addr = DispatchComputeSurfaceAddrFromCoord(pIn, pOut);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSurfaceCoordFromAddr
+*   @brief
+*       Validate the input, then dispatch the address-to-coordinate computation into pOut
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS when bitPosition >= 8 or numSamples > m_maxSamples
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSurfaceCoordFromAddr(
+    const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT*      pOut    ///< [out] output structure
+    ) const
+{
+    // Accept only bit positions below 8 and sample counts within m_maxSamples
+    const BOOL_32 accepted = (pIn->bitPosition < 8) &&
+                             (pIn->numSamples <= m_maxSamples);
+
+    if (!accepted)
+    {
+        return ADDR_INVALIDPARAMS;
+    }
+
+    DispatchComputeSurfaceCoordFromAddr(pIn, pOut);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeSliceTileSwizzle
+*   @brief
+*       Entry of EgBasedAddrLib ComputeSliceTileSwizzle; result is stored in pOut->tileSwizzle
+*   @return
+*       ADDR_OK, or ADDR_INVALIDPARAMS when pIn->pTileInfo is NULL or reports no banks
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE EgBasedAddrLib::HwlComputeSliceTileSwizzle(
+    const ADDR_COMPUTE_SLICESWIZZLE_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SLICESWIZZLE_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    // The swizzle is only computed when tile info is present and its bank count is positive
+    const BOOL_32 hasBanks = pIn->pTileInfo && (pIn->pTileInfo->banks > 0);
+
+    if (!hasBanks)
+    {
+        // pOut->tileSwizzle is not written on this path
+        return ADDR_INVALIDPARAMS;
+    }
+
+    // Delegate to the non-virtual helper with the caller's parameters unchanged
+    pOut->tileSwizzle = ComputeSliceTileSwizzle(pIn->tileMode,
+                                                pIn->baseSwizzle,
+                                                pIn->slice,
+                                                pIn->baseAddr,
+                                                pIn->pTileInfo);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeHtileBpp
+*
+*   @brief
+*       Report the htile bpp; only the 8x8 block configuration is supported (asserted)
+*
+*   @return
+*       Htile bpp, always 32 in this implementation
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlComputeHtileBpp(
+    BOOL_32 isWidth8,   ///< [in] TRUE if block width is 8
+    BOOL_32 isHeight8   ///< [in] TRUE if block height is 8
+    ) const
+{
+    const UINT_32 htileBpp = 32;
+    ADDR_ASSERT(isWidth8 && isHeight8); // anything other than 8x8 is not supported
+    return htileBpp;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlComputeHtileBaseAlign
+*
+*   @brief
+*       Compute htile base alignment from the pipe interleave, pipe count and (TC only) banks
+*
+*   @return
+*       Htile base alignment
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlComputeHtileBaseAlign(
+    BOOL_32         isTcCompatible, ///< [in] if TC compatible
+    BOOL_32         isLinear,       ///< [in] if it is linear mode
+    ADDR_TILEINFO*  pTileInfo       ///< [in] Tile info
+    ) const
+{
+    // Baseline alignment: pipe interleave size times the pipe count
+    const UINT_32 pipeAlign = m_pipeInterleaveBytes * HwlGetPipes(pTileInfo);
+
+    if (!isTcCompatible)
+    {
+        return pipeAlign;
+    }
+
+    // TC-compatible case: scale by the bank count when tile info is available
+    ADDR_ASSERT(pTileInfo != NULL);
+
+    return pTileInfo ? (pipeAlign * pTileInfo->banks) : pipeAlign;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlGetPitchAlignmentMicroTiled
+*
+*   @brief
+*       Compute 1D tiled surface pitch alignment. The result is the return value; nothing is
+*       written through the parameters.
+*
+*   @return
+*       pitch alignment
+***************************************************************************************************
+*/
+UINT_32 EgBasedAddrLib::HwlGetPitchAlignmentMicroTiled(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32             numSamples         ///< [in] number of samples
+    ) const
+{
+    // Thickness of a micro tile in the given tile mode
+    const UINT_32 tileThickness = ComputeSurfaceThickness(tileMode);
+
+    //
+    // Depth surfaces that also carry stencil are aligned as if they were 8 bpp: the pitch
+    // alignment depends on bpp and the stencil buffer has the larger requirement.
+    // Depth-only surfaces (noStencil set) keep the caller's bpp.
+    //
+    // Note: this actually does not work for mipmap but mipmap depth texture is not really
+    // sampled with mipmap.
+    //
+    const UINT_32 alignBpp = (flags.depth && !flags.noStencil) ? 8 : bpp;
+
+    // Pixels held by one micro tile of this thickness
+    const UINT_32 tilePixels = MicroTilePixels * tileThickness;
+
+    // Pixels (counting every sample) that fit into one pipe interleave
+    const UINT_32 interleavePixels =
+        BYTES_TO_BITS(m_pipeInterleaveBytes) / (alignBpp * numSamples);
+
+    // Whole micro tiles per pipe interleave; the integer division yields 0 when one micro
+    // tile holds more pixels than a pipe interleave does
+    const UINT_32 interleaveTiles = interleavePixels / tilePixels;
+
+    // Never align to less than a single micro tile width
+    const UINT_32 pitchAlign = Max(MicroTileWidth, interleaveTiles * MicroTileWidth);
+
+    return pitchAlign;
+}
+
+/**
+***************************************************************************************************
+*   EgBasedAddrLib::HwlGetSizeAdjustmentMicroTiled
+*
+*   @brief
+*       Compute the 1D tiled slice size; this implementation only reads *pPitch and *pHeight
+*
+*   @return
+*       Logical slice size in bytes
+***************************************************************************************************
+*/
+UINT_64 EgBasedAddrLib::HwlGetSizeAdjustmentMicroTiled(
+    UINT_32             thickness,      ///< [in] thickness
+    UINT_32             bpp,            ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,          ///< [in] surface flags (not used by this implementation)
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             baseAlign,      ///< [in] base alignment (only checked by the assert)
+    UINT_32             pitchAlign,     ///< [in] pitch alignment (not used by this implementation)
+    UINT_32*            pPitch,         ///< [in/out] pointer to pitch (only read here)
+    UINT_32*            pHeight         ///< [in/out] pointer to height (only read here)
+    ) const
+{
+    UINT_64 logicalSliceSize;
+    UINT_64 physicalSliceSize;
+
+    UINT_32 pitch   = *pPitch;
+    UINT_32 height  = *pHeight;
+
+    // Logical slice: pitch * height * bpp * numSamples (no 1D MSAA so actually numSamples == 1)
+    logicalSliceSize = BITS_TO_BYTES(static_cast<UINT_64>(pitch) * height * bpp * numSamples);
+
+    // Physical slice: multiplied by thickness; only consumed by the assert below
+    physicalSliceSize =  logicalSliceSize * thickness;
+
+    //
+    // R800 will always pad physical slice size to baseAlign which is pipe_interleave_bytes
+    // (the statement is terminated explicitly so it does not depend on the macro's expansion)
+    ADDR_ASSERT((physicalSliceSize % baseAlign) == 0);
+
+    return logicalSliceSize;
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.h
new file mode 100644
index 00000000000..84adb66eedc
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/egbaddrlib.h
@@ -0,0 +1,411 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  egbaddrlib.h
+* @brief Contains the EgBasedAddrLib class definition.
+***************************************************************************************************
+*/
+
+#ifndef __EG_BASED_ADDR_LIB_H__
+#define __EG_BASED_ADDR_LIB_H__
+
+#include "addrlib.h"
+
+
+/// Bit-field record; it is the pOutput type of ComputeSurfaceCoord2DFromBankPipe
+struct CoordFromBankPipe
+{
+    UINT_32 xBits : 3; // 3-bit field; presumably packed x-coordinate bits - TODO confirm
+    UINT_32 yBits : 4; // 4-bit field; presumably packed y-coordinate bits - TODO confirm
+
+    UINT_32 xBit3 : 1; // 1-bit fields from here on; presumably one coordinate bit each,
+    UINT_32 xBit4 : 1; // named by axis and bit index - TODO confirm against the
+    UINT_32 xBit5 : 1; // implementation that fills this structure
+    UINT_32 yBit3 : 1;
+    UINT_32 yBit4 : 1;
+    UINT_32 yBit5 : 1;
+    UINT_32 yBit6 : 1; // declared widths add up to 14 bits in total
+};
+
+/**
+***************************************************************************************************
+* @brief This class is the Evergreen based address library
+* @note  Abstract class: declares pure virtual Hwl* hooks; constructor/destructor are protected
+***************************************************************************************************
+*/
+class EgBasedAddrLib : public AddrLib
+{
+protected:
+    EgBasedAddrLib(const AddrClient* pClient);
+    virtual ~EgBasedAddrLib();
+
+public:
+
+    /// Surface info functions
+
+    // NOTE: DispatchComputeSurfaceInfo using TileInfo takes both an input and an output.
+    //       On input:
+    //       One or more fields may be 0 to be calculated/defaulted - pre-SI h/w.
+    //       H/W using tile mode index only accepts none or all 0's - SI and newer h/w.
+    //       It then returns the actual tiling configuration used.
+    //       Other methods' TileInfo must be valid on entry
+    BOOL_32 DispatchComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    ADDR_E_RETURNCODE DispatchComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+protected:
+    // Hwl interface
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeSliceTileSwizzle(
+        const ADDR_COMPUTE_SLICESWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_SLICESWIZZLE_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlExtractBankPipeSwizzle(
+        const ADDR_EXTRACT_BANKPIPE_SWIZZLE_INPUT* pIn,
+        ADDR_EXTRACT_BANKPIPE_SWIZZLE_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlCombineBankPipeSwizzle(
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, ADDR_TILEINFO*  pTileInfo,
+        UINT_64 baseAddr, UINT_32* pTileSwizzle) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeBaseSwizzle(
+        const ADDR_COMPUTE_BASE_SWIZZLE_INPUT* pIn,
+        ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const;
+
+    virtual UINT_32 HwlComputeHtileBpp(
+        BOOL_32 isWidth8, BOOL_32 isHeight8) const;
+
+    virtual UINT_32 HwlComputeHtileBaseAlign(
+        BOOL_32 isTcCompatible, BOOL_32 isLinear, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pOut);
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    virtual BOOL_32 HwlDegradeBaseLevel(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const;
+
+    virtual UINT_32 HwlComputeQbStereoRightSwizzle(
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pInfo) const;
+
+    virtual VOID HwlComputePixelCoordFromOffset(
+        UINT_32 offset, UINT_32 bpp, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder) const;
+
+    /// Return Cmask block max. NOTE(review): a count is returned through BOOL_32 - confirm type
+    virtual BOOL_32 HwlGetMaxCmaskBlockMax() const
+    {
+        return 16383; // 14 bits
+    }
+
+    // Sub-hwl interface
+    /// Pure virtual function to setup tile info (indices) if client requests to do so
+    virtual VOID HwlSetupTileInfo(
+        AddrTileMode tileMode, ADDR_SURFACE_FLAGS flags,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        ADDR_TILEINFO* inputTileInfo, ADDR_TILEINFO* outputTileInfo,
+        AddrTileType inTileType, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+    /// Pure virtual function to get pitch alignment for linear modes
+    virtual UINT_32 HwlGetPitchAlignmentLinear(UINT_32 bpp, ADDR_SURFACE_FLAGS flags) const = 0;
+
+    /// Pure virtual function to get size adjustment for linear modes
+    virtual UINT_64 HwlGetSizeAdjustmentLinear(
+        AddrTileMode tileMode,
+        UINT_32 bpp, UINT_32 numSamples, UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight, UINT_32 *pHeightAlign) const = 0;
+
+    virtual UINT_32 HwlGetPitchAlignmentMicroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples) const;
+
+    virtual UINT_64 HwlGetSizeAdjustmentMicroTiled(
+        UINT_32 thickness, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight) const;
+
+    /// Pure virtual function to do extra sanity check
+    virtual BOOL_32 HwlSanityCheckMacroTiled(
+        ADDR_TILEINFO* pTileInfo) const = 0;
+
+    /// Pure virtual function to check current level to be the last macro tiled one
+    virtual VOID HwlCheckLastMacroTiledLvl(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const = 0;
+
+    /// Adjusts bank before bank is modified by rotation
+    virtual UINT_32 HwlPreAdjustBank(
+        UINT_32 tileX, UINT_32 bank, ADDR_TILEINFO*  pTileInfo) const = 0;
+
+    virtual VOID HwlComputeSurfaceCoord2DFromBankPipe(
+        AddrTileMode tileMode, UINT_32* pX, UINT_32* pY, UINT_32 slice,
+        UINT_32 bank, UINT_32 pipe,
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, UINT_32 tileSlices,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo) const = 0;
+
+    virtual BOOL_32 HwlTileInfoEqual(
+        const ADDR_TILEINFO* pLeft, const ADDR_TILEINFO* pRight) const;
+
+    virtual AddrTileMode HwlDegradeThickTileMode(
+        AddrTileMode baseTileMode, UINT_32 numSlices, UINT_32* pBytesPerTile) const;
+
+    virtual INT_32 HwlPostCheckTileIndex(
+        const ADDR_TILEINFO* pInfo, AddrTileMode mode, AddrTileType type,
+        INT curIndex = TileIndexInvalid) const
+    {
+        return TileIndexInvalid; // this default always reports an invalid tile index
+    }
+
+    virtual VOID HwlFmaskPreThunkSurfInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pFmaskIn,
+        const ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut,
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pSurfIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut) const
+    {   // default implementation is intentionally empty
+    }
+
+    virtual VOID HwlFmaskPostThunkSurfInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut) const
+    {   // default implementation is intentionally empty
+    }
+
+    /// Virtual function to check if the height needs extra padding
+    /// for stereo right eye offset, to avoid bank pipe swizzle
+    virtual BOOL_32 HwlStereoCheckRightOffsetPadding() const
+    {
+        return FALSE;
+    }
+
+    virtual BOOL_32 HwlReduceBankWidthHeight(
+        UINT_32 tileSize, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 bankHeightAlign, UINT_32 pipes,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    // Protected non-virtual functions
+
+    /// Mip level functions
+    AddrTileMode ComputeSurfaceMipLevelTileMode(
+        AddrTileMode baseTileMode, UINT_32 bpp,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSlices, UINT_32 numSamples,
+        UINT_32 pitchAlign, UINT_32 heightAlign,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    /// Swizzle functions
+    VOID ExtractBankPipeSwizzle(
+        UINT_32 base256b, ADDR_TILEINFO* pTileInfo,
+        UINT_32* pBankSwizzle, UINT_32* pPipeSwizzle) const;
+
+    UINT_32 GetBankPipeSwizzle(
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle,
+        UINT_64 baseAddr, ADDR_TILEINFO*  pTileInfo) const;
+
+    UINT_32 ComputeSliceTileSwizzle(
+        AddrTileMode tileMode, UINT_32 baseSwizzle, UINT_32 slice, UINT_64 baseAddr,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    /// Addressing functions
+    UINT_32 ComputeBankFromCoord(
+        UINT_32 x, UINT_32 y, UINT_32 slice,
+        AddrTileMode tileMode, UINT_32 bankSwizzle, UINT_32 tileSpitSlice, // sic; "Split"?
+        ADDR_TILEINFO* pTileInfo) const;
+
+    UINT_32 ComputeBankFromAddr(
+        UINT_64 addr, UINT_32 numBanks, UINT_32 numPipes) const;
+
+    UINT_32 ComputePipeRotation(
+        AddrTileMode tileMode, UINT_32 numPipes) const;
+
+    UINT_32 ComputeBankRotation(
+        AddrTileMode tileMode, UINT_32 numBanks,
+        UINT_32 numPipes) const;
+
+    VOID ComputeSurfaceCoord2DFromBankPipe(
+        AddrTileMode tileMode, UINT_32 x, UINT_32 y, UINT_32 slice,
+        UINT_32 bank, UINT_32 pipe,
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, UINT_32 tileSlices,
+        ADDR_TILEINFO* pTileInfo,
+        CoordFromBankPipe *pOutput) const;
+
+    /// Htile/Cmask functions
+    UINT_64 ComputeHtileBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 bpp,
+        BOOL_32 isLinear, UINT_32 numSlices, UINT_64* sliceBytes, UINT_32 baseAlign) const;
+
+    // Static functions
+    static BOOL_32 IsTileInfoAllZero(ADDR_TILEINFO* pTileInfo);
+    static UINT_32 ComputeFmaskNumPlanesFromNumSamples(UINT_32 numSamples);
+    static UINT_32 ComputeFmaskResolvedBppFromNumSamples(UINT_32 numSamples);
+
+private:
+
+    BOOL_32 ComputeSurfaceInfoLinear(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut,
+        UINT_32 padDims) const;
+
+    BOOL_32 ComputeSurfaceInfoMicroTiled(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut,
+        UINT_32 padDims,
+        AddrTileMode expTileMode) const;
+
+    BOOL_32 ComputeSurfaceInfoMacroTiled(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut,
+        UINT_32 padDims,
+        AddrTileMode expTileMode) const;
+
+    BOOL_32 ComputeSurfaceAlignmentsLinear(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32* pBaseAlign, UINT_32* pPitchAlign, UINT_32* pHeightAlign) const;
+
+    BOOL_32 ComputeSurfaceAlignmentsMicroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32* pBaseAlign, UINT_32* pPitchAlign, UINT_32* pHeightAlign) const;
+
+    BOOL_32 ComputeSurfaceAlignmentsMacroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags,
+        UINT_32 mipLevel, UINT_32 numSamples,
+        ADDR_TILEINFO* pTileInfo,
+        UINT_32* pBaseAlign, UINT_32* pPitchAlign, UINT_32* pHeightAlign) const;
+
+    /// Surface addressing functions
+    UINT_64 DispatchComputeSurfaceAddrFromCoord(
+        const ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    VOID    DispatchComputeSurfaceCoordFromAddr(
+        const ADDR_COMPUTE_SURFACE_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_COORDFROMADDR_OUTPUT* pOut) const;
+
+    UINT_64 ComputeSurfaceAddrFromCoordMicroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode,
+        AddrTileType microTileType, BOOL_32 isDepthSampleOrder,
+        UINT_32* pBitPosition) const;
+
+    UINT_64 ComputeSurfaceAddrFromCoordMacroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode,
+        AddrTileType microTileType, BOOL_32 ignoreSE, BOOL_32 isDepthSampleOrder,
+        UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        ADDR_TILEINFO* pTileInfo,
+        UINT_32* pBitPosition) const;
+
+    VOID    ComputeSurfaceCoordFromAddrMacroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 tileBase, UINT_32 compBits,
+        AddrTileType microTileType, BOOL_32 ignoreSE, BOOL_32 isDepthSampleOrder,
+        UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        ADDR_TILEINFO* pTileInfo,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample) const;
+
+    /// Fmask functions
+    UINT_64 DispatchComputeFmaskAddrFromCoord(
+        const ADDR_COMPUTE_FMASK_ADDRFROMCOORD_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_ADDRFROMCOORD_OUTPUT* pOut) const;
+
+    VOID    DispatchComputeFmaskCoordFromAddr(
+        const ADDR_COMPUTE_FMASK_COORDFROMADDR_INPUT* pIn,
+        ADDR_COMPUTE_FMASK_COORDFROMADDR_OUTPUT* pOut) const;
+
+    // FMASK related methods - private
+    UINT_64 ComputeFmaskAddrFromCoordMicroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample, UINT_32 plane,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples, AddrTileMode tileMode,
+        BOOL_32 resolved, UINT_32* pBitPosition) const;
+
+    VOID    ComputeFmaskCoordFromAddrMicroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, BOOL_32 resolved,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample, UINT_32* pPlane) const;
+
+    VOID    ComputeFmaskCoordFromAddrMacroTiled(
+        UINT_64 addr, UINT_32 bitPosition,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples, AddrTileMode tileMode,
+        UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo,
+        BOOL_32 resolved,
+        UINT_32* pX, UINT_32* pY, UINT_32* pSlice, UINT_32* pSample, UINT_32* pPlane) const;
+
+    UINT_64 ComputeFmaskAddrFromCoordMacroTiled(
+        UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 sample, UINT_32 plane,
+        UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        AddrTileMode tileMode, UINT_32 pipeSwizzle, UINT_32 bankSwizzle,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo,
+        BOOL_32 resolved,
+        UINT_32* pBitPosition) const;
+
+    /// Sanity check functions
+    BOOL_32 SanityCheckMacroTiled(
+        ADDR_TILEINFO* pTileInfo) const;
+
+protected:
+    UINT_32 m_ranks;                ///< Number of ranks - MC_ARB_RAMCFG.NOOFRANK
+    UINT_32 m_logicalBanks;         ///< Logical banks = m_banks * m_ranks if m_banks != 16
+    UINT_32 m_bankInterleave;       ///< Bank interleave, as a multiple of pipe interleave size
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.cpp
new file mode 100644
index 00000000000..a858b55b7cf
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.cpp
@@ -0,0 +1,2818 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  siaddrlib.cpp
+* @brief Contains the implementation for the SIAddrLib class.
+***************************************************************************************************
+*/
+
+#include "siaddrlib.h"
+
+#include "si_gb_reg.h"
+
+#include "si_ci_vi_merged_enum.h"
+
+#if BRAHMA_BUILD
+#include "amdgpu_id.h"
+#else
+#include "si_id.h"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+***************************************************************************************************
+*   AddrSIHwlInit
+*
+*   @brief
+*       Free-function factory hook: asks SIAddrLib to build an address library for the client.
+*   @return
+*       Whatever SIAddrLib::CreateObj(pClient) yields, handed back as an AddrLib pointer.
+***************************************************************************************************
+*/
+AddrLib* AddrSIHwlInit(const AddrClient* pClient)
+{
+    AddrLib* const pSiLib = SIAddrLib::CreateObj(pClient);
+    return pSiLib;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::SIAddrLib
+*
+*   @brief
+*       Constructor. Chains to EgBasedAddrLib(pClient), starts m_noOfEntries at 0, zero-fills
+*       m_settings and tags the object's class as SI_ADDRLIB.
+***************************************************************************************************
+*/
+SIAddrLib::SIAddrLib(const AddrClient* pClient) :
+    EgBasedAddrLib(pClient),
+    m_noOfEntries(0)
+{
+    memset(&m_settings, 0, sizeof(m_settings));
+    m_class = SI_ADDRLIB;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::~SIAddrLib
+*
+*   @brief
+*       Destructor. The body is empty: this class performs no explicit teardown of its own here.
+***************************************************************************************************
+*/
+SIAddrLib::~SIAddrLib()
+{
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetPipes
+*
+*   @brief
+*       Report how many pipes a surface uses, based on the pipe config stored in its tile info.
+*
+*   @return
+*       Pipe count for pTileInfo->pipeConfig. With a NULL pTileInfo the call asserts and
+*       falls back to the library-wide m_pipes value.
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlGetPipes(
+    const ADDR_TILEINFO* pTileInfo    ///< [in] Tile info
+    ) const
+{
+    if (pTileInfo == NULL)
+    {
+        // Tile info is expected here; flag the misuse but still hand back a usable value,
+        // namely the global pipe count.
+        ADDR_ASSERT_ALWAYS();
+        return m_pipes;
+    }
+
+    // The per-surface answer is derived from the pipe config alone.
+    const UINT_32 pipesForSurface = GetPipePerSurf(pTileInfo->pipeConfig);
+    return pipesForSurface;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::GetPipePerSurf
+*
+*   @brief
+*       Translate a pipe configuration enum into the number of pipes it names
+*       (P2 -> 2, P4_* -> 4, P8_* -> 8, P16_* -> 16).
+*
+*   @return
+*       Pipe count for the given config. For a config that is not listed below the
+*       function asserts and returns the library-wide m_pipes value instead.
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::GetPipePerSurf(
+    AddrPipeCfg pipeConfig   ///< [in] pipe config
+    ) const
+{
+    switch (pipeConfig)
+    {
+        case ADDR_PIPECFG_P2:
+            return 2;
+        case ADDR_PIPECFG_P4_8x16:
+        case ADDR_PIPECFG_P4_16x16:
+        case ADDR_PIPECFG_P4_16x32:
+        case ADDR_PIPECFG_P4_32x32:
+            return 4;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+        case ADDR_PIPECFG_P8_16x32_8x16:
+        case ADDR_PIPECFG_P8_32x32_8x16:
+        case ADDR_PIPECFG_P8_16x32_16x16:
+        case ADDR_PIPECFG_P8_32x32_16x16:
+        case ADDR_PIPECFG_P8_32x32_16x32:
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            return 8;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            return 16;
+        default:
+            break;
+    }
+
+    // Unlisted config: raise the assert, then fall back to the global pipe count.
+    ADDR_ASSERT(!"Invalid pipe config");
+    return m_pipes;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::ComputePipeFromCoord
+*   @brief
+*       Compute pipe number from coordinates: per-config XORs of micro-tile coordinate bits
+*       form the raw pipe, which is then XORed with the slice-rotated, masked pipe swizzle
+*   @return
+*       Pipe number
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::ComputePipeFromCoord(
+    UINT_32         x,              ///< [in] x coordinate
+    UINT_32         y,              ///< [in] y coordinate
+    UINT_32         slice,          ///< [in] slice index
+    AddrTileMode    tileMode,       ///< [in] tile mode
+    UINT_32         pipeSwizzle,    ///< [in] pipe swizzle
+    BOOL_32         ignoreSE,       ///< [in] TRUE if shader engines are ignored (not read here)
+    ADDR_TILEINFO*  pTileInfo       ///< [in] Tile info
+    ) const
+{
+    UINT_32 pipe;
+    UINT_32 pipeBit0 = 0;
+    UINT_32 pipeBit1 = 0;
+    UINT_32 pipeBit2 = 0;
+    UINT_32 pipeBit3 = 0;
+    UINT_32 sliceRotation;
+    UINT_32 numPipes = 0;
+
+    UINT_32 tx = x / MicroTileWidth;  // x in units of micro tiles
+    UINT_32 ty = y / MicroTileHeight; // y in units of micro tiles
+    UINT_32 x3 = _BIT(tx,0); // presumably bit 0 of tx, i.e. bit 3 of x if a micro tile is 8 wide
+    UINT_32 x4 = _BIT(tx,1);
+    UINT_32 x5 = _BIT(tx,2);
+    UINT_32 x6 = _BIT(tx,3);
+    UINT_32 y3 = _BIT(ty,0);
+    UINT_32 y4 = _BIT(ty,1);
+    UINT_32 y5 = _BIT(ty,2);
+    UINT_32 y6 = _BIT(ty,3);
+
+    switch (pTileInfo->pipeConfig) // pTileInfo is dereferenced without a NULL check
+    {
+        case ADDR_PIPECFG_P2:
+            pipeBit0 = x3 ^ y3;
+            numPipes = 2;
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            pipeBit0 = x4 ^ y3;
+            pipeBit1 = x3 ^ y4;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y4;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y5;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            pipeBit0 = x3 ^ y3 ^ x5;
+            pipeBit1 = x5 ^ y5;
+            numPipes = 4;
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            pipeBit0 = x4 ^ y3 ^ x5;
+            pipeBit1 = x3 ^ y5;
+            numPipes = 8; // NOTE(review): pipeBit2 stays 0 here although numPipes is 8 — confirm
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            pipeBit0 = x4 ^ y3 ^ x5;
+            pipeBit1 = x3 ^ y4;
+            pipeBit2 = x4 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x5 ^ y4;
+            pipeBit2 = x4 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            pipeBit0 = x4 ^ y3 ^ x5;
+            pipeBit1 = x3 ^ y4;
+            pipeBit2 = x5 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y4;
+            pipeBit2 = x5 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y6;
+            pipeBit2 = x5 ^ y5;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            pipeBit0 = x3 ^ y3 ^ x5;
+            pipeBit1 = x6 ^ y5;
+            pipeBit2 = x5 ^ y6;
+            numPipes = 8;
+            break;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+            pipeBit0 = x4 ^ y3;
+            pipeBit1 = x3 ^ y4;
+            pipeBit2 = x5 ^ y6;
+            pipeBit3 = x6 ^ y5;
+            numPipes = 16;
+            break;
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            pipeBit0 = x3 ^ y3 ^ x4;
+            pipeBit1 = x4 ^ y4;
+            pipeBit2 = x5 ^ y6;
+            pipeBit3 = x6 ^ y5;
+            numPipes = 16;
+            break;
+        default:
+            ADDR_UNHANDLED_CASE(); // numPipes stays 0, so (numPipes - 1) below is an all-ones mask
+            break;
+    }
+    pipe = pipeBit0 | (pipeBit1 << 1) | (pipeBit2 << 2) | (pipeBit3 << 3); // pipeBit0 is the LSB
+
+    UINT_32 microTileThickness = ComputeSurfaceThickness(tileMode);
+
+    // Apply pipe rotation for the slice. Only the three 3D tile modes rotate, by
+    // Max(1, numPipes/2 - 1) for every microTileThickness slices; all other modes use 0.
+    // The rotation is added to pipeSwizzle before that is masked and XORed into the pipe.
+    switch (tileMode)
+    {
+        case ADDR_TM_3D_TILED_THIN1:    //fall through thin
+        case ADDR_TM_3D_TILED_THICK:    //fall through thick
+        case ADDR_TM_3D_TILED_XTHICK:
+            sliceRotation =
+                Max(1, static_cast<INT_32>(numPipes / 2) - 1) * (slice / microTileThickness);
+            break;
+        default:
+            sliceRotation = 0;
+            break;
+    }
+    pipeSwizzle += sliceRotation;
+    pipeSwizzle &= (numPipes - 1); // numPipes is 2/4/8/16 for handled configs, so this is a bit mask
+
+    pipe = pipe ^ pipeSwizzle;
+
+    return pipe;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::ComputeTileCoordFromPipeAndElemIdx
+*
+*   @brief
+*       Compute (x,y) of a tile within a macro tile from its pipe index and element index
+*   @return
+*       N/A; results go to *pX and *pY (neither is written when pipeCfg is unhandled)
+***************************************************************************************************
+*/
+VOID SIAddrLib::ComputeTileCoordFromPipeAndElemIdx(
+    UINT_32         elemIdx,          ///< [in] per pipe element index within a macro tile
+    UINT_32         pipe,             ///< [in] pipe index
+    AddrPipeCfg     pipeCfg,          ///< [in] pipe config
+    UINT_32         pitchInMacroTile, ///< [in] surface pitch in macro tile
+    UINT_32         x,                ///< [in] x coordinate of the (0,0) tile in a macro tile
+    UINT_32         y,                ///< [in] y coordinate of the (0,0) tile in a macro tile
+    UINT_32*        pX,               ///< [out] x coordinate
+    UINT_32*        pY                ///< [out] y coordinate
+    ) const
+{
+    UINT_32 pipebit0 = _BIT(pipe,0);
+    UINT_32 pipebit1 = _BIT(pipe,1);
+    UINT_32 pipebit2 = _BIT(pipe,2);
+    UINT_32 pipebit3 = _BIT(pipe,3);
+    UINT_32 elemIdx0 = _BIT(elemIdx,0);
+    UINT_32 elemIdx1 = _BIT(elemIdx,1);
+    UINT_32 elemIdx2 = _BIT(elemIdx,2);
+    UINT_32 x3 = 0; // x3..x6 / y3..y6: single coordinate bits solved for below
+    UINT_32 x4 = 0;
+    UINT_32 x5 = 0;
+    UINT_32 x6 = 0;
+    UINT_32 y3 = 0;
+    UINT_32 y4 = 0;
+    UINT_32 y5 = 0;
+    UINT_32 y6 = 0;
+
+    // Each case solves its pipe/element-index equations for the coordinate bits; the order of
+    // the assignments matters because later bits are derived from earlier ones.
+    switch(pipeCfg)
+    {
+        case ADDR_PIPECFG_P2:
+            x4 = elemIdx2;
+            y4 = elemIdx1 ^ x4;
+            y3 = elemIdx0 ^ x4;
+            x3 = pipebit0 ^ y3;
+            *pY = Bits2Number(2, y4, y3); // presumably packs the listed bits, first one highest
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            x4 = elemIdx1;
+            y4 = elemIdx0 ^ x4;
+            x3 = pipebit1 ^ y4;
+            y3 = pipebit0 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            x4 = elemIdx1;
+            y3 = elemIdx0 ^ x4;
+            y4 = pipebit1 ^ x4;
+            x3 = pipebit0 ^ y3 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            x3 = elemIdx0 ^ pipebit0;
+            y5 = _BIT(y,5); // taken from the macro tile origin passed in as y
+            x4 = pipebit1 ^ y5;
+            y3 = pipebit0 ^ x3 ^ x4;
+            y4 = elemIdx1 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            x4 = elemIdx2;
+            y3 = elemIdx0 ^ x4;
+            y4 = elemIdx1 ^ x4;
+            if((pitchInMacroTile % 2) == 0)
+            {   //even
+                y5 = _BIT(y,5);
+                x5 = pipebit1 ^ y5;
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {   //odd
+                x5 = _BIT(x,5);
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            x4 = elemIdx0;
+            y5 = _BIT(y,5);
+            x5 = _BIT(x,5);
+            x3 = pipebit1 ^ y5;
+            y4 = pipebit2 ^ x4;
+            y3 = pipebit0 ^ x5 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            x3 = elemIdx0;
+            y4 = pipebit1 ^ x3;
+            y5 = _BIT(y,5);
+            x5 = _BIT(x,5);
+            x4 = pipebit2 ^ y5;
+            y3 = pipebit0 ^ x4 ^ x5;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            x4 = elemIdx1;
+            y4 = elemIdx0 ^ x4;
+            x3 = pipebit1 ^ y4;
+            if((pitchInMacroTile % 2) == 0)
+            {  //even
+                y5 = _BIT(y,5);
+                x5 = _BIT(x,5); // NOTE(review): overwritten on the next line, so this has no effect
+                x5 = pipebit2 ^ y5;
+                y3 = pipebit0 ^ x4 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {  //odd
+                x5 = _BIT(x,5);
+                y3 = pipebit0 ^ x4 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            x3 = elemIdx0;
+            x5 = _BIT(x,5);
+            y5 = _BIT(y,5);
+            x4 = pipebit2 ^ y5;
+            y4 = pipebit1 ^ x5;
+            y3 = pipebit0 ^ x3 ^ x4;
+            *pY = Bits2Number(2, y4, y3);
+            *pX = Bits2Number(2, x4, x3);
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            x4 = elemIdx1;
+            y3 = elemIdx0 ^ x4;
+            x3 = y3^x4^pipebit0;
+            y4 = pipebit1 ^ x4;
+            if((pitchInMacroTile % 2) == 0)
+            {   //even
+                y5 = _BIT(y,5);
+                x5 = pipebit2 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {   //odd
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            if((pitchInMacroTile % 2) == 0)
+            {   //even
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x4 = pipebit1 ^ y6;
+                y3 = elemIdx0 ^ x4;
+                y4 = elemIdx1 ^ x4;
+                x3 = pipebit0 ^ y3 ^ x4;
+                x5 = pipebit2 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            else
+            {   //odd
+                y6 = _BIT(y,6);
+                x4 = pipebit1 ^ y6;
+                y3 = elemIdx0 ^ x4;
+                y4 = elemIdx1 ^ x4;
+                x3 = pipebit0 ^ y3 ^ x4;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(2, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            x4 = elemIdx2;
+            y3 = elemIdx0 ^ x4;
+            y4 = elemIdx1 ^ x4;
+            if((pitchInMacroTile % 4) == 0)
+            {   //multiple of 4
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x6 = pipebit1 ^ y5;
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(4, x6, x5, x4, x3);
+            }
+            else
+            {
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x3 = pipebit0 ^ y3 ^ x5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+            x4 = elemIdx1;
+            y4 = elemIdx0 ^ x4;
+            y3 = pipebit0 ^ x4;
+            x3 = pipebit1 ^ y4;
+            if((pitchInMacroTile % 4) == 0)
+            {   //multiple of 4
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x6 = pipebit3 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(4, x6, x5,x4, x3);
+            }
+            else
+            {
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            break;
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            x4 = elemIdx1;
+            y3 = elemIdx0 ^ x4;
+            y4 = pipebit1 ^ x4;
+            x3 = pipebit0 ^ y3 ^ x4;
+            if((pitchInMacroTile % 4) == 0)
+            {   //multiple of 4
+                y5 = _BIT(y,5);
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                x6 = pipebit3 ^ y5;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(4, x6, x5, x4, x3);
+            }
+            else
+            {
+                y6 = _BIT(y,6);
+                x5 = pipebit2 ^ y6;
+                *pY = Bits2Number(2, y4, y3);
+                *pX = Bits2Number(3, x5, x4, x3);
+            }
+            break;
+        default:
+            ADDR_UNHANDLED_CASE(); // *pX and *pY are not written on this path
+    }
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::TileCoordToMaskElementIndex
+*
+*   @brief
+*       Compute element index from coordinates in tiles, plus two per-config bit counts
+*   @return
+*       Element index; 0 for an unhandled pipe config, in which case neither [out] is written
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::TileCoordToMaskElementIndex(
+    UINT_32         tx,                 ///< [in] x coord, in Tiles
+    UINT_32         ty,                 ///< [in] y coord, in Tiles
+    AddrPipeCfg     pipeConfig,         ///< [in] pipe config
+    UINT_32*        macroShift,         ///< [out] macro shift
+    UINT_32*        elemIdxBits         ///< [out] tile offset bits
+    ) const
+{
+    UINT_32 elemIdx = 0;
+    UINT_32 elemIdx0, elemIdx1, elemIdx2;
+    UINT_32 tx0, tx1;
+    UINT_32 ty0, ty1;
+
+    // Only the two lowest bits of each tile coordinate take part in the index.
+    tx0 = _BIT(tx,0);
+    tx1 = _BIT(tx,1);
+    ty0 = _BIT(ty,0);
+    ty1 = _BIT(ty,1);
+
+    switch(pipeConfig)
+    {
+        case ADDR_PIPECFG_P2:
+            *macroShift = 3;
+            *elemIdxBits =3;
+            elemIdx2 = tx1;
+            elemIdx1 = tx1 ^ ty1;
+            elemIdx0 = tx1 ^ ty0;
+            elemIdx = Bits2Number(3,elemIdx2,elemIdx1,elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            *macroShift = 2;
+            *elemIdxBits =2;
+            elemIdx1 = tx1;
+            elemIdx0 = tx1 ^ ty1;
+            elemIdx = Bits2Number(2,elemIdx1,elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            *macroShift = 2;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            *macroShift = 2;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            *macroShift = 2;
+            *elemIdxBits =3; // one more index bit than the shift for this config
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx2 = tx1;
+            elemIdx = Bits2Number(3, elemIdx2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            *macroShift = 1;
+            *elemIdxBits =1;
+            elemIdx0 = tx1;
+            elemIdx = elemIdx0;
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            *macroShift = 1;
+            *elemIdxBits =1;
+            elemIdx0 = tx0;
+            elemIdx = elemIdx0;
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            *macroShift = 1;
+            *elemIdxBits =2;
+            elemIdx1 = tx1;
+            elemIdx0 = tx1^ty1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            *macroShift = 1;
+            *elemIdxBits =1;
+            elemIdx0 = tx0;
+            elemIdx = elemIdx0;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            *macroShift = 1;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            *macroShift = 1;
+            *elemIdxBits =2;
+            elemIdx0 =  tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            *macroShift = 1;
+            *elemIdxBits =3;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1^ty1;
+            elemIdx2 = tx1;
+            elemIdx = Bits2Number(3, elemIdx2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P16_32x32_8x16:
+            *macroShift = 0;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty1;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        case ADDR_PIPECFG_P16_32x32_16x16:
+            *macroShift = 0;
+            *elemIdxBits =2;
+            elemIdx0 = tx1^ty0;
+            elemIdx1 = tx1;
+            elemIdx = Bits2Number(2, elemIdx1, elemIdx0);
+            break;
+        default:
+            ADDR_UNHANDLED_CASE(); // *macroShift and *elemIdxBits are left unwritten here
+            break;
+    }
+
+    return elemIdx;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeTileDataWidthAndHeightLinear
+*
+*   @brief
+*       Pick the square footprint used for per-tile data (CMASK and HTILE) when the layout
+*       is linear.
+*
+*   @return
+*       N/A; the result is stored through pMacroWidth and pMacroHeight.
+*
+*   @note
+*       MacroWidth and macroHeight are measured in pixels
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlComputeTileDataWidthAndHeightLinear(
+    UINT_32*        pMacroWidth,     ///< [out] macro tile width
+    UINT_32*        pMacroHeight,    ///< [out] macro tile height
+    UINT_32         bpp,             ///< [in] bits per pixel (not consulted here)
+    ADDR_TILEINFO*  pTileInfo        ///< [in] tile info
+    ) const
+{
+    ADDR_ASSERT(pTileInfo != NULL);
+
+    /// In linear mode the htile or cmask buffer is padded out to 4 tiles per side, except
+    /// for the three pipe configs listed below, which are padded out to 8 tiles per side.
+    /// Per the original note, more pipe configs actually need 8-tile padding, but the SI
+    /// family has a bug here which is fixed in the CI family.
+    UINT_32 tilesPerSide;
+
+    switch (pTileInfo->pipeConfig)
+    {
+        case ADDR_PIPECFG_P8_32x64_32x32:
+        case ADDR_PIPECFG_P16_32x32_8x16:
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            tilesPerSide = 8;
+            break;
+        default:
+            tilesPerSide = 4;
+            break;
+    }
+
+    *pMacroWidth  = tilesPerSide * MicroTileWidth;
+    *pMacroHeight = tilesPerSide * MicroTileHeight;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeHtileBytes
+*   @brief
+*       SI implementation of the htile size query; every argument goes to ComputeHtileBytes.
+*   @return
+*       Htile size in bytes, exactly as ComputeHtileBytes reports it
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlComputeHtileBytes(
+    UINT_32     pitch,          ///< [in] pitch
+    UINT_32     height,         ///< [in] height
+    UINT_32     bpp,            ///< [in] bits per pixel
+    BOOL_32     isLinear,       ///< [in] if it is linear mode
+    UINT_32     numSlices,      ///< [in] number of slices
+    UINT_64*    pSliceBytes,    ///< [out] bytes per slice
+    UINT_32     baseAlign       ///< [in] base alignments
+    ) const
+{
+    const UINT_64 htileBytes =
+        ComputeHtileBytes(pitch, height, bpp, isLinear, numSlices, pSliceBytes, baseAlign);
+    return htileBytes;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeXmaskAddrFromCoord
+*
+*   @brief
+*       Compute address from coordinates for htile/cmask
+*   @return
+*       Byte address; the bit offset inside that byte is returned through pBitPosition
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlComputeXmaskAddrFromCoord(
+    UINT_32        pitch,          ///< [in] pitch
+    UINT_32        height,         ///< [in] height
+    UINT_32        x,              ///< [in] x coord
+    UINT_32        y,              ///< [in] y coord
+    UINT_32        slice,          ///< [in] slice/depth index
+    UINT_32        numSlices,      ///< [in] number of slices
+    UINT_32        factor,         ///< [in] factor that indicates cmask(2) or htile(1)
+    BOOL_32        isLinear,       ///< [in] linear or tiled HTILE layout
+    BOOL_32        isWidth8,       ///< [in] TRUE if width is 8, FALSE means 4. Not read in this function
+    BOOL_32        isHeight8,      ///< [in] TRUE if height is 8, FALSE means 4. Not read in this function
+    ADDR_TILEINFO* pTileInfo,      ///< [in] Tile info
+    UINT_32*       pBitPosition    ///< [out] bit position inside a byte
+    ) const
+{
+    UINT_32 tx = x / MicroTileWidth;
+    UINT_32 ty = y / MicroTileHeight;
+    UINT_32 newPitch;
+    UINT_32 newHeight;
+    UINT_64 totalBytes;
+    UINT_32 macroWidth;
+    UINT_32 macroHeight;
+    UINT_64 pSliceBytes;
+    UINT_32 pBaseAlign;
+    UINT_32 tileNumPerPipe;
+    UINT_32 elemBits;
+
+    if (factor == 2) //CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 256;
+
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroWidth,
+                         &macroHeight);
+        elemBits = CmaskElemBits;
+    }
+    else //HTile
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 512;
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         TRUE,
+                         TRUE,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &macroWidth,
+                         &macroHeight,
+                         &pSliceBytes,
+                         &pBaseAlign);
+        elemBits = 32;
+    }
+
+    const UINT_32 pitchInTile = newPitch / MicroTileWidth;
+    const UINT_32 heightInTile = newHeight / MicroTileHeight; // rows of tiles: divide by tile height
+    UINT_64 macroOffset; // Per pipe starting offset of the macro tile in which this tile lies.
+    UINT_64 microNumber; // Per pipe element number of this tile inside that macro tile / slice.
+    UINT_32 microX;
+    UINT_32 microY;
+    UINT_64 microOffset;
+    UINT_32 microShift = 0;  // zeroed: the callee leaves it unwritten for an unhandled pipe config
+    UINT_64 totalOffset;
+    UINT_32 elemIdxBits = 0; // zeroed for the same reason as microShift
+    UINT_32 elemIdx =
+        TileCoordToMaskElementIndex(tx, ty, pTileInfo->pipeConfig, &microShift, &elemIdxBits);
+
+    UINT_32 numPipes = HwlGetPipes(pTileInfo);
+
+    if (isLinear)
+    {   //linear addressing
+        // Linear addressing wastes a lot of memory if slice > 1, since each pipe has the full
+        // slice memory foot print instead of divided by numPipes.
+        microX = tx / 4; // Macro Tile is 4x4
+        microY = ty / 4 ;
+        microNumber = static_cast<UINT_64>(microX + microY * (pitchInTile / 4)) << microShift;
+
+        UINT_32 sliceBits = pitchInTile * heightInTile;
+
+        // do htile single slice alignment if the flag is true
+        if (m_configFlags.useHtileSliceAlign && (factor == 1))  //Htile
+        {
+            sliceBits = PowTwoAlign(sliceBits, BITS_TO_BYTES(HtileCacheBits) * numPipes / elemBits);
+        }
+        // Widen before multiplying so the product cannot wrap at 32 bits for deep/large surfaces.
+        macroOffset = static_cast<UINT_64>(slice) * (sliceBits / numPipes) * elemBits;
+    }
+    else
+    {   //tiled addressing
+        const UINT_32 macroWidthInTile = macroWidth / MicroTileWidth; // Now in unit of Tiles
+        const UINT_32 macroHeightInTile = macroHeight / MicroTileHeight;
+        const UINT_32 pitchInCL = pitchInTile / macroWidthInTile;
+        const UINT_32 heightInCL = heightInTile / macroHeightInTile;
+
+        const UINT_32 macroX = x / macroWidth;
+        const UINT_32 macroY = y / macroHeight;
+        const UINT_32 macroNumber = macroX + macroY * pitchInCL + slice * pitchInCL * heightInCL;
+
+        // Per pipe starting offset of the cache line in which this tile lies.
+        microX = (x % macroWidth) / MicroTileWidth / 4; // Macro Tile is 4x4
+        microY = (y % macroHeight) / MicroTileHeight / 4 ;
+        microNumber = static_cast<UINT_64>(microX + microY * (macroWidth / MicroTileWidth / 4)) << microShift;
+
+        // Widened for the same reason as in the linear branch.
+        macroOffset = static_cast<UINT_64>(macroNumber) * tileNumPerPipe * elemBits;
+    }
+
+    if(elemIdxBits == microShift)
+    {
+        microNumber += elemIdx;
+    }
+    else
+    {
+        microNumber >>= elemIdxBits; // clear the low elemIdxBits bits before adding the index
+        microNumber <<= elemIdxBits;
+        microNumber += elemIdx;
+    }
+
+    microOffset = elemBits * microNumber;
+    totalOffset = microOffset + macroOffset;
+
+    UINT_32 pipe = ComputePipeFromCoord(x, y, 0, ADDR_TM_2D_TILED_THIN1, 0, FALSE, pTileInfo);
+    UINT_64 addrInBits = totalOffset % (m_pipeInterleaveBytes * 8) +
+                   pipe * (m_pipeInterleaveBytes * 8) +
+                   totalOffset / (m_pipeInterleaveBytes * 8) * (m_pipeInterleaveBytes * 8) * numPipes;
+    *pBitPosition = static_cast<UINT_32>(addrInBits) % 8;
+    UINT_64 addr = addrInBits / 8;
+
+    return addr;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeXmaskCoordFromAddr
+*
+*   @brief
+*       Compute the coord from an address of a cmask/htile
+*
+*   @return
+*       N/A
+*
+*   @note
+*       This method is reused by htile, so rename to Xmask
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlComputeXmaskCoordFromAddr(
+    UINT_64         addr,           ///< [in] address
+    UINT_32         bitPosition,    ///< [in] bitPosition in a byte
+    UINT_32         pitch,          ///< [in] pitch
+    UINT_32         height,         ///< [in] height
+    UINT_32         numSlices,      ///< [in] number of slices
+    UINT_32         factor,         ///< [in] 2 selects cmask, any other value selects htile
+    BOOL_32         isLinear,       ///< [in] linear or tiled HTILE layout
+    BOOL_32         isWidth8,       ///< [in] Not used by SI
+    BOOL_32         isHeight8,      ///< [in] Not used by SI
+    ADDR_TILEINFO*  pTileInfo,      ///< [in] Tile info
+    UINT_32*        pX,             ///< [out] x coord
+    UINT_32*        pY,             ///< [out] y coord
+    UINT_32*        pSlice          ///< [out] slice index
+    ) const
+{
+    UINT_32 newPitch;
+    UINT_32 newHeight;
+    UINT_64 totalBytes;
+    UINT_32 clWidth;
+    UINT_32 clHeight;
+    UINT_32 tileNumPerPipe;
+    UINT_64 sliceBytes;             // only written by the htile path below
+
+    *pX = 0;
+    *pY = 0;
+    *pSlice = 0;
+
+    if (factor == 2) //CMASK
+    {
+        ADDR_CMASK_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 256;
+
+        ComputeCmaskInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &clWidth,
+                         &clHeight);
+    }
+    else //HTile
+    {
+        ADDR_HTILE_FLAGS flags = {{0}};
+
+        tileNumPerPipe = 512;
+
+        ComputeHtileInfo(flags,
+                         pitch,
+                         height,
+                         numSlices,
+                         isLinear,
+                         TRUE,
+                         TRUE,
+                         pTileInfo,
+                         &newPitch,
+                         &newHeight,
+                         &totalBytes,
+                         &clWidth,
+                         &clHeight,
+                         &sliceBytes);
+    }
+
+    const UINT_32 pitchInTile = newPitch / MicroTileWidth;
+    const UINT_32 heightInTile = newHeight / MicroTileHeight; // Y axis: use the tile height
+    const UINT_32 pitchInMacroTile = pitchInTile / 4;
+    UINT_32 macroShift;
+    UINT_32 elemIdxBits;
+    // get macroShift and elemIdxBits
+    TileCoordToMaskElementIndex(0, 0, pTileInfo->pipeConfig, &macroShift, &elemIdxBits);
+
+    const UINT_32 numPipes = HwlGetPipes(pTileInfo);
+    const UINT_32 pipe = (UINT_32)((addr / m_pipeInterleaveBytes) % numPipes);
+    // Byte offset within this pipe, i.e. the address with the pipe interleaving removed
+    UINT_64 localOffset = (addr % m_pipeInterleaveBytes) +
+        (addr / m_pipeInterleaveBytes / numPipes)* m_pipeInterleaveBytes;
+
+    UINT_32 tileIndex;
+    if (factor == 2) //CMASK
+    {
+        tileIndex = (UINT_32)(localOffset * 2 + (bitPosition != 0)); // two entries per byte
+    }
+    else
+    {
+        tileIndex = (UINT_32)(localOffset / 4); // one entry per four bytes
+    }
+
+    UINT_32 macroOffset;
+    if (isLinear)
+    {
+        UINT_32 sliceSizeInTile = pitchInTile * heightInTile;
+
+        // do htile single slice alignment if the flag is true
+        if (m_configFlags.useHtileSliceAlign && (factor == 1))  //Htile
+        {
+            sliceSizeInTile = PowTwoAlign(sliceSizeInTile, static_cast<UINT_32>(sliceBytes) / 64);
+        }
+        *pSlice = tileIndex / (sliceSizeInTile / numPipes);
+        macroOffset = tileIndex % (sliceSizeInTile / numPipes);
+    }
+    else
+    {
+        const UINT_32 clWidthInTile = clWidth / MicroTileWidth; // Now in unit of Tiles
+        const UINT_32 clHeightInTile = clHeight / MicroTileHeight;
+        const UINT_32 pitchInCL = pitchInTile / clWidthInTile;
+        const UINT_32 heightInCL = heightInTile / clHeightInTile;
+        const UINT_32 clIndex = tileIndex / tileNumPerPipe;
+
+        UINT_32 clX = clIndex % pitchInCL;
+        UINT_32 clY = (clIndex % (heightInCL * pitchInCL)) / pitchInCL;
+
+        *pX = clX * clWidthInTile * MicroTileWidth;
+        *pY = clY * clHeightInTile * MicroTileHeight;
+        *pSlice = clIndex / (heightInCL * pitchInCL);
+
+        macroOffset = tileIndex % tileNumPerPipe;
+    }
+
+    UINT_32 elemIdx = macroOffset & 7; // low three bits of the per-pipe offset
+    macroOffset >>= elemIdxBits;
+
+    if (elemIdxBits != macroShift) // some low macroOffset bits are then taken from the pipe id
+    {
+        macroOffset <<= (elemIdxBits - macroShift);
+
+        UINT_32 pipebit1 = _BIT(pipe,1);
+        UINT_32 pipebit2 = _BIT(pipe,2);
+        UINT_32 pipebit3 = _BIT(pipe,3);
+        if (pitchInMacroTile % 2)
+        {   //odd
+            switch (pTileInfo->pipeConfig)
+            {
+                case ADDR_PIPECFG_P4_32x32:
+                    macroOffset |= pipebit1;
+                    break;
+                case ADDR_PIPECFG_P8_32x32_8x16:
+                case ADDR_PIPECFG_P8_32x32_16x16:
+                case ADDR_PIPECFG_P8_32x32_16x32:
+                    macroOffset |= pipebit2;
+                    break;
+                default:
+                    break;
+            }
+
+        }
+
+        if (pitchInMacroTile % 4)
+        {
+            if (pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x64_32x32)
+            {
+                macroOffset |= (pipebit1<<1);
+            }
+            if((pTileInfo->pipeConfig == ADDR_PIPECFG_P16_32x32_8x16) ||
+               (pTileInfo->pipeConfig == ADDR_PIPECFG_P16_32x32_16x16))
+            {
+                macroOffset |= (pipebit3<<1);
+            }
+        }
+    }
+
+    UINT_32 macroX;
+    UINT_32 macroY;
+
+    if (isLinear)
+    {
+        macroX = macroOffset % pitchInMacroTile;
+        macroY = macroOffset / pitchInMacroTile;
+    }
+    else
+    {
+        const UINT_32 clWidthInMacroTile = clWidth / (MicroTileWidth * 4);
+        macroX = macroOffset % clWidthInMacroTile;
+        macroY = macroOffset / clWidthInMacroTile;
+    }
+
+    *pX += macroX * 4 * MicroTileWidth;
+    *pY += macroY * 4 * MicroTileHeight;
+
+    UINT_32 microX;
+    UINT_32 microY;
+    ComputeTileCoordFromPipeAndElemIdx(elemIdx, pipe, pTileInfo->pipeConfig, pitchInMacroTile,
+                                       *pX, *pY, &microX, &microY);
+
+    *pX += microX * MicroTileWidth;
+    *pY += microY * MicroTileHeight; // Y axis: use the tile height
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetPitchAlignmentLinear
+*   @brief
+*       Get pitch alignment
+*   @return
+*       pitch alignment
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlGetPitchAlignmentLinear(
+    UINT_32             bpp,    ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags   ///< [in] surface flags
+    ) const
+{
+    const UINT_32 bytesPerPixel = BITS_TO_BYTES(bpp);
+
+    // Interleaved access requires a 256B aligned pitch, so fall back to pre-SI alignment:
+    // one pipe interleave worth of pixels, but no fewer than 64.
+    if (flags.interleaved)
+    {
+        return Max(64u, m_pipeInterleaveBytes / bytesPerPixel);
+    }
+
+    // Otherwise the alignment is 64 bytes worth of pixels,
+    // but no fewer than 8.
+    const UINT_32 pixelsPer64Bytes = 64 / bytesPerPixel;
+
+    return Max(8u, pixelsPer64Bytes);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetSizeAdjustmentLinear
+*
+*   @brief
+*       Adjust linear surface pitch and slice size
+*
+*   @return
+*       Logical slice size in bytes
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlGetSizeAdjustmentLinear(
+    AddrTileMode        tileMode,       ///< [in] tile mode
+    UINT_32             bpp,            ///< [in] bits per pixel
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             baseAlign,      ///< [in] base alignment
+    UINT_32             pitchAlign,     ///< [in] pitch alignment
+    UINT_32*            pPitch,         ///< [in/out] pointer to pitch
+    UINT_32*            pHeight,        ///< [in/out] pointer to height
+    UINT_32*            pHeightAlign    ///< [in/out] pointer to height align
+    ) const
+{
+    // Linear-general surfaces are not padded; neither *pPitch nor *pHeightAlign is touched.
+    if (tileMode == ADDR_TM_LINEAR_GENERAL)
+    {
+        return BITS_TO_BYTES(static_cast<UINT_64>(*pPitch) * (*pHeight) * bpp * numSamples);
+    }
+
+    const UINT_32 rows = *pHeight;
+
+    // The per-slice pixel count is aligned to one pipe interleave worth of pixels, with a
+    // floor of 64 pixels on that alignment.
+    const UINT_32 interleavePixels = m_pipeInterleaveBytes / BITS_TO_BYTES(bpp);
+    const UINT_32 slicePixelAlign  = (interleavePixels >= 64) ? interleavePixels : 64;
+
+    // numSamples should be 1 in real cases (no MSAA for linear but TGL may pass non 1 value)
+    UINT_32 paddedPitch = *pPitch;
+    UINT_64 slicePixels = static_cast<UINT_64>(paddedPitch) * rows * numSamples;
+
+    // Grow the pitch one pitchAlign step at a time until the slice pixel count is aligned.
+    while ((slicePixels % slicePixelAlign) != 0)
+    {
+        paddedPitch += pitchAlign;
+        slicePixels  = static_cast<UINT_64>(paddedPitch) * rows * numSamples;
+    }
+
+    *pPitch = paddedPitch;
+
+    // Smallest row count for which pitch * rows is a multiple of the slice alignment.
+    UINT_32 rowAlign = 1;
+
+    while (((paddedPitch * rowAlign) % slicePixelAlign) != 0)
+    {
+        ++rowAlign;
+    }
+
+    *pHeightAlign = rowAlign;
+
+    return BITS_TO_BYTES(slicePixels * bpp);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPreHandleBaseLvl3xPitch
+*
+*   @brief
+*       Pre-handler of 3x pitch (96 bit) adjustment
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlPreHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    ADDR_ASSERT(pIn->width == expPitch);
+
+    if (pIn->flags.pow2Pad)
+    {
+        // From SI, pow2Pad means the pitch is expanded 3x first, then padded to pow2: no-op
+        ADDR_ASSERT(IsPow2(expPitch));
+    }
+    else
+    {
+        // NOTE(review): the base-class result is discarded, so expPitch is returned as passed
+        AddrLib::HwlPreHandleBaseLvl3xPitch(pIn, expPitch);
+    }
+
+    return expPitch;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPostHandleBaseLvl3xPitch
+*
+*   @brief
+*       Post-handler of 3x pitch adjustment
+*
+*   @return
+*       Expected pitch
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlPostHandleBaseLvl3xPitch(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,        ///< [in] input
+    UINT_32                                 expPitch    ///< [in] pitch
+    ) const
+{
+    // From SI, with pow2Pad the pitch is expanded 3x first, then padded to pow2, so the
+    // base-class post-handler is skipped in that case.
+    if (pIn->flags.pow2Pad)
+    {
+        return expPitch;
+    }
+
+    // @note The pitch will be divided by 3 in the end so the value will look odd, but h/w
+    // should compute a correct pitch from it.  NOTE(review): the call's result is discarded.
+    AddrLib::HwlPostHandleBaseLvl3xPitch(pIn, expPitch);
+    return expPitch;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetPitchAlignmentMicroTiled
+*
+*   @brief
+*       Compute 1D tiled surface pitch alignment
+*
+*   @return
+*       pitch alignment
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlGetPitchAlignmentMicroTiled(
+    AddrTileMode        tileMode,          ///< [in] tile mode
+    UINT_32             bpp,               ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,             ///< [in] surface flags
+    UINT_32             numSamples         ///< [in] number of samples
+    ) const
+{
+    // Fixed pitch alignment used whenever flags.qbStereo is clear.
+    const UINT_32 siDefaultAlign = 8;
+
+    // Only flags.qbStereo surfaces defer to the EgBasedAddrLib computation.
+    if (!flags.qbStereo)
+    {
+        return siDefaultAlign;
+    }
+
+    const UINT_32 stereoAlign =
+        EgBasedAddrLib::HwlGetPitchAlignmentMicroTiled(tileMode,bpp,flags,numSamples);
+    return stereoAlign;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetSizeAdjustmentMicroTiled
+*
+*   @brief
+*       Adjust 1D tiled surface pitch and slice size
+*
+*   @return
+*       Logical slice size in bytes
+***************************************************************************************************
+*/
+UINT_64 SIAddrLib::HwlGetSizeAdjustmentMicroTiled(
+    UINT_32             thickness,      ///< [in] thickness
+    UINT_32             bpp,            ///< [in] bits per pixel
+    ADDR_SURFACE_FLAGS  flags,          ///< [in] surface flags
+    UINT_32             numSamples,     ///< [in] number of samples
+    UINT_32             baseAlign,      ///< [in] base alignment
+    UINT_32             pitchAlign,     ///< [in] pitch alignment
+    UINT_32*            pPitch,         ///< [in/out] pointer to pitch
+    UINT_32*            pHeight         ///< [in/out] pointer to height
+    ) const
+{
+    // Work on local copies; *pPitch is only written back once the final pitch is known,
+    // because the stencil workaround below compares against the caller's original value.
+    UINT_32       curPitch = *pPitch;
+    const UINT_32 rows     = *pHeight;
+    UINT_64       logicalBytes;
+
+    // Pitch alignment is always 8, so if slice size is not padded to base alignment
+    // (pipe_interleave_size), we need to increase pitch.  The loop recomputes the slice
+    // size for the current pitch and stops as soon as the physical slice is aligned.
+    for (;;)
+    {
+        // Logical slice: pitch * height * bpp * numSamples (no 1D MSAA so numSamples == 1)
+        logicalBytes = BITS_TO_BYTES(static_cast<UINT_64>(curPitch) * rows * bpp * numSamples);
+
+        // Physical slice: multiplied by thickness
+        if (((logicalBytes * thickness) % baseAlign) == 0)
+        {
+            break;
+        }
+
+        curPitch += pitchAlign;
+    }
+
+#if !ALT_TEST
+    //
+    // Special workaround for depth/stencil buffer, use 8 bpp to align depth buffer again since
+    // the stencil plane may have larger pitch if the slice size is smaller than base alignment.
+    //
+    // Note: this actually does not work for mipmap but mipmap depth texture is not really
+    // sampled with mipmap.
+    //
+    if (flags.depth && !flags.noStencil)
+    {
+        ADDR_ASSERT(numSamples == 1);
+
+        UINT_64 stencilBytes = static_cast<UINT_64>(curPitch) * rows; // 1 byte stencil
+
+        while ((stencilBytes % baseAlign) != 0)
+        {
+            curPitch += pitchAlign; // Stencil plane's pitch alignment matches the depth plane's
+
+            stencilBytes = static_cast<UINT_64>(curPitch) * rows;
+        }
+
+        if (curPitch != *pPitch)
+        {
+            // If this is a mipmap, this padded one cannot be sampled as a whole mipmap!
+            logicalBytes = stencilBytes * BITS_TO_BYTES(bpp);
+        }
+    }
+#endif
+    *pPitch = curPitch;
+
+    // No adjust for pHeight
+
+    return logicalBytes;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlConvertChipFamily
+*
+*   @brief
+*       Convert familyID defined in atiid.h to AddrChipFamily and set m_chipFamily/m_chipRevision
+*   @return
+*       AddrChipFamily
+***************************************************************************************************
+*/
+AddrChipFamily SIAddrLib::HwlConvertChipFamily(
+    UINT_32 uChipFamily,        ///< [in] chip family defined in atiid.h
+    UINT_32 uChipRevision)      ///< [in] chip revision defined in "asic_family"_id.h
+{
+    if (uChipFamily == FAMILY_SI)
+    {
+        // Flag the family first, then let the revision macros pick out the individual ASIC.
+        m_settings.isSouthernIsland = 1;
+
+        m_settings.isTahiti     = ASICREV_IS_TAHITI_P(uChipRevision);
+        m_settings.isPitCairn   = ASICREV_IS_PITCAIRN_PM(uChipRevision);
+        m_settings.isCapeVerde  = ASICREV_IS_CAPEVERDE_M(uChipRevision);
+        m_settings.isOland      = ASICREV_IS_OLAND_M(uChipRevision);
+        m_settings.isHainan     = ASICREV_IS_HAINAN_V(uChipRevision);
+    }
+    else
+    {
+        ADDR_ASSERT(!"This should be a Fusion");
+    }
+
+    // The result is ADDR_CHIP_FAMILY_SI on every path, including the asserting one.
+    return ADDR_CHIP_FAMILY_SI;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlSetupTileInfo
+*
+*   @brief
+*       Setup default value of tile info for SI
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlSetupTileInfo(
+    AddrTileMode                        tileMode,       ///< [in] Tile mode
+    ADDR_SURFACE_FLAGS                  flags,          ///< [in] Surface type flags
+    UINT_32                             bpp,            ///< [in] Bits per pixel
+    UINT_32                             pitch,          ///< [in] Pitch in pixels
+    UINT_32                             height,         ///< [in] Height in pixels
+    UINT_32                             numSamples,     ///< [in] Number of samples
+    ADDR_TILEINFO*                      pTileInfoIn,    ///< [in] Tile info input: NULL for default
+    ADDR_TILEINFO*                      pTileInfoOut,   ///< [out] Tile info output
+    AddrTileType                        inTileType,     ///< [in] Tile type
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*   pOut            ///< [out] Output
+    ) const
+{
+    UINT_32 thickness = ComputeSurfaceThickness(tileMode);
+    ADDR_TILEINFO* pTileInfo = pTileInfoOut;
+    INT index = TileIndexInvalid; // stays invalid unless a table entry is chosen below
+
+    // Fail-safe code: override the requested tile type where the surface kind dictates it
+    if (!IsLinear(tileMode))
+    {
+        // 128 bpp/thick tiling must be non-displayable.
+        // Fmask reuse color buffer's entry but bank-height field can be from another entry
+        // To simplify the logic, fmask entry should be picked from non-displayable ones
+        if (bpp == 128 || thickness > 1 || flags.fmask || flags.prt)
+        {
+            inTileType = ADDR_NON_DISPLAYABLE;
+        }
+
+        if (flags.depth || flags.stencil) // checked last, so depth/stencil wins over the above
+        {
+            inTileType = ADDR_DEPTH_SAMPLE_ORDER;
+        }
+    }
+
+    // Partial valid fields are not allowed for SI: only an all-zero tile info picks a default.
+    if (IsTileInfoAllZero(pTileInfo))
+    {
+        if (IsMacroTiled(tileMode))
+        {
+            if (flags.prt)
+            {
+                if (numSamples == 1)
+                {
+                    if (flags.depth)
+                    {
+                        switch (bpp)
+                        {
+                            case 16:
+                                index = 3;
+                                break;
+                            case 32:
+                                index = 6;
+                                break;
+                            default:
+                                ADDR_ASSERT_ALWAYS();
+                                break;
+                        }
+                    }
+                    else
+                    {
+                        switch (bpp)
+                        {
+                            case 8:
+                                index = 21;
+                                break;
+                            case 16:
+                                index = 22;
+                                break;
+                            case 32:
+                                index = 23;
+                                break;
+                            case 64:
+                                index = 24;
+                                break;
+                            case 128:
+                                index = 25;
+                                break;
+                            default: // NOTE(review): index is still invalid here, yet += 5 may apply
+                                break;
+                        }
+
+                        if (thickness > 1)
+                        {
+                            ADDR_ASSERT(bpp != 128);
+                            index += 5; // presumably the thick PRT entries sit 5 slots later - confirm
+                        }
+                    }
+                }
+                else
+                {
+                    ADDR_ASSERT(numSamples == 4);
+
+                    if (flags.depth)
+                    {
+                        switch (bpp)
+                        {
+                            case 16:
+                                index = 5;
+                                break;
+                            case 32:
+                                index = 7;
+                                break;
+                            default:
+                                ADDR_ASSERT_ALWAYS();
+                                break;
+                        }
+                    }
+                    else
+                    {
+                        switch (bpp)
+                        {
+                            case 8:
+                                index = 23;
+                                break;
+                            case 16:
+                                index = 24;
+                                break;
+                            case 32:
+                                index = 25;
+                                break;
+                            case 64:
+                                index = 30;
+                                break;
+                            default:
+                                ADDR_ASSERT_ALWAYS();
+                                break;
+                        }
+                    }
+                }
+            }//end of PRT part
+            // See table entries 0-7
+            else if (flags.depth || flags.stencil)
+            {
+                if (flags.compressZ)
+                {
+                    if (flags.stencil)
+                    {
+                        index = 0;
+                    }
+                    else
+                    {
+                        // optimal tile index for compressed depth/stencil.
+                        switch (numSamples)
+                        {
+                            case 1:
+                                index = 0;
+                                break;
+                            case 2:
+                            case 4:
+                                index = 1;
+                                break;
+                            case 8:
+                                index = 2;
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+                }
+                else // unCompressZ
+                {
+                    index = 3;
+                }
+            }
+            else //non PRT & non Depth & non Stencil
+            {
+                // See table entries 9-12
+                if (inTileType == ADDR_DISPLAYABLE)
+                {
+                    switch (bpp)
+                    {
+                        case 8:
+                            index = 10;
+                            break;
+                        case 16:
+                            index = 11;
+                            break;
+                        case 32:
+                            index = 12;
+                            break;
+                        case 64: // shares entry 12 with 32 bpp
+                            index = 12;
+                            break;
+                        default:
+                            break;
+                    }
+                }
+                else
+                {
+                    // See table entries 13-17
+                    if (thickness == 1)
+                    {
+                        if (flags.fmask)
+                        {
+                            UINT_32 fmaskPixelSize = bpp * numSamples;
+
+                            switch (fmaskPixelSize)
+                            {
+                                case 8:
+                                    index = 14;
+                                    break;
+                                case 16:
+                                    index = 15;
+                                    break;
+                                case 32:
+                                    index = 16;
+                                    break;
+                                case 64:
+                                    index = 17;
+                                    break;
+                                default:
+                                    ADDR_ASSERT_ALWAYS();
+                            }
+                        }
+                        else
+                        {
+                            switch (bpp)
+                            {
+                                case 8:
+                                    index = 14;
+                                    break;
+                                case 16:
+                                    index = 15;
+                                    break;
+                                case 32:
+                                    index = 16;
+                                    break;
+                                case 64:
+                                    index = 17;
+                                    break;
+                                case 128: // shares entry 17 with 64 bpp
+                                    index = 17;
+                                    break;
+                                default:
+                                    break;
+                            }
+                        }
+                    }
+                    else // thick tiling - entries 18-20
+                    {
+                        switch (thickness)
+                        {
+                            case 4:
+                                index = 20;
+                                break;
+                            case 8:
+                                index = 19;
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            if (tileMode == ADDR_TM_LINEAR_ALIGNED)
+            {
+                index = 8;
+            }
+            else if (tileMode == ADDR_TM_LINEAR_GENERAL)
+            {
+                index = TileIndexLinearGeneral;
+            }
+            else
+            {
+                if (flags.depth || flags.stencil)
+                {
+                    index = 4;
+                }
+                else if (inTileType == ADDR_DISPLAYABLE)
+                {
+                    index = 9;
+                }
+                else if (thickness == 1)
+                {
+                    index = 13;
+                }
+                else
+                {
+                    index = 18;
+                }
+            }
+        }
+
+        if (index >= 0 && index <= 31) // only indices 0..31 are looked up in m_tileTable
+        {
+            *pTileInfo      = m_tileTable[index].info;
+            pOut->tileType  = m_tileTable[index].type;
+        }
+
+        if (index == TileIndexLinearGeneral) // reuses entry 8, the ADDR_TM_LINEAR_ALIGNED entry
+        {
+            *pTileInfo      = m_tileTable[8].info;
+            pOut->tileType  = m_tileTable[8].type;
+        }
+    }
+    else // caller supplied a non-zero tile info: keep it, apart from the stencil case below
+    {
+        if (pTileInfoIn)
+        {
+            if (flags.stencil && pTileInfoIn->tileSplitBytes == 0)
+            {
+                // Stencil always uses index 0
+                *pTileInfo = m_tileTable[0].info;
+            }
+        }
+        // Pass through tile type
+        pOut->tileType = inTileType;
+    }
+
+    pOut->tileIndex = index; // still TileIndexInvalid when the caller's tile info was kept
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::DecodeGbRegs
+*
+*   @brief
+*       Decodes GB_ADDR_CONFIG and noOfBanks/noOfRanks
+*
+*   @return
+*       TRUE if all settings are valid
+*
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::DecodeGbRegs(
+    const ADDR_REGISTER_VALUE* pRegValue) ///< [in] create input
+{
+    BOOL_32         allValid = TRUE;
+    GB_ADDR_CONFIG  gbAddrConfig;
+    gbAddrConfig.val = pRegValue->gbAddrConfig;
+
+    if (gbAddrConfig.f.pipe_interleave_size == ADDR_CONFIG_PIPE_INTERLEAVE_256B)
+    {
+        m_pipeInterleaveBytes = ADDR_PIPEINTERLEAVE_256B;
+    }
+    else if (gbAddrConfig.f.pipe_interleave_size == ADDR_CONFIG_PIPE_INTERLEAVE_512B)
+    {
+        m_pipeInterleaveBytes = ADDR_PIPEINTERLEAVE_512B;
+    }
+    else
+    {
+        allValid = FALSE;
+        ADDR_UNHANDLED_CASE();
+    }
+
+    if (gbAddrConfig.f.row_size == ADDR_CONFIG_1KB_ROW)
+    {
+        m_rowSize = ADDR_ROWSIZE_1KB;
+    }
+    else if (gbAddrConfig.f.row_size == ADDR_CONFIG_2KB_ROW)
+    {
+        m_rowSize = ADDR_ROWSIZE_2KB;
+    }
+    else if (gbAddrConfig.f.row_size == ADDR_CONFIG_4KB_ROW)
+    {
+        m_rowSize = ADDR_ROWSIZE_4KB;
+    }
+    else
+    {
+        allValid = FALSE;
+        ADDR_UNHANDLED_CASE();
+    }
+
+    // noOfBanks 0/1/2 selects 4/8/16 banks; noOfRanks 0/1 selects 1/2 ranks.
+    if (pRegValue->noOfBanks == 0)
+    {
+        m_banks = 4;
+    }
+    else if (pRegValue->noOfBanks == 1)
+    {
+        m_banks = 8;
+    }
+    else if (pRegValue->noOfBanks == 2)
+    {
+        m_banks = 16;
+    }
+    else
+    {
+        allValid = FALSE;
+        ADDR_UNHANDLED_CASE();
+    }
+
+    if (pRegValue->noOfRanks == 0)
+    {
+        m_ranks = 1;
+    }
+    else if (pRegValue->noOfRanks == 1)
+    {
+        m_ranks = 2;
+    }
+    else
+    {
+        allValid = FALSE;
+        ADDR_UNHANDLED_CASE();
+    }
+
+    m_logicalBanks = m_banks * m_ranks;
+    ADDR_ASSERT(m_logicalBanks <= 16);
+    return allValid;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlInitGlobalParams
+*
+*   @brief
+*       Initializes global parameters
+*
+*   @return
+*       TRUE if all settings are valid
+*
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlInitGlobalParams(
+    const ADDR_CREATE_INPUT* pCreateIn) ///< [in] create input
+{
+    const ADDR_REGISTER_VALUE* pRegs = &pCreateIn->regValue;
+
+    // If the register values do not decode, nothing further is set up.
+    if (!DecodeGbRegs(pRegs))
+    {
+        return FALSE;
+    }
+
+    // Pipe count per ASIC: Tahiti/Pitcairn use 8, Cape Verde/Oland use 4, anything else 2.
+    if (m_settings.isTahiti || m_settings.isPitCairn)
+    {
+        m_pipes = 8;
+    }
+    else if (m_settings.isCapeVerde || m_settings.isOland)
+    {
+        m_pipes = 4;
+    }
+    else
+    {
+        // Hainan is 2-pipe (m_settings.isHainan == 1)
+        m_pipes = 2;
+    }
+
+    const BOOL_32 tableValid = InitTileSettingTable(pRegs->pTileConfig, pRegs->noOfEntries);
+    m_maxSamples = 16;
+
+    return tableValid;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlConvertTileInfoToHW
+*   @brief
+*       Entry of si's ConvertTileInfoToHW
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlConvertTileInfoToHW(
+    const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn, ///< [in] input structure
+    ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut      ///< [out] output structure
+    ) const
+{
+    // EgBasedAddrLib converts first; this override then only adjusts pipeConfig.
+    const ADDR_E_RETURNCODE baseCode = EgBasedAddrLib::HwlConvertTileInfoToHW(pIn, pOut);
+
+    if (baseCode != ADDR_OK)
+    {
+        return baseCode;
+    }
+
+    if (pIn->reverse != FALSE)
+    {
+        // Reverse direction: the output pipeConfig is the input value plus one.
+        pOut->pTileInfo->pipeConfig =
+            static_cast<AddrPipeCfg>(pIn->pTileInfo->pipeConfig + 1);
+        return ADDR_OK;
+    }
+
+    if (pIn->pTileInfo->pipeConfig == ADDR_PIPECFG_INVALID)
+    {
+        return ADDR_INVALIDPARAMS;
+    }
+
+    // Forward direction: the output pipeConfig is the input value minus one.
+    pOut->pTileInfo->pipeConfig =
+        static_cast<AddrPipeCfg>(pIn->pTileInfo->pipeConfig - 1);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeXmaskCoordYFrom8Pipe
+*
+*   @brief
+*       Compute the Y coord which will be added to Xmask Y
+*       coord.
+*   @return
+*       Y coord
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlComputeXmaskCoordYFrom8Pipe(
+    UINT_32         pipe,       ///< [in] pipe id
+    UINT_32         x           ///< [in] tile coord x, which is original x coord / 8
+    ) const
+{
+    // This function should never be called since it is 6xx/8xx specific.
+    // Keep this empty implementation to avoid any mis-use.
+    // Neither parameter is read; the assert fires on any call and 0 is returned.
+    ADDR_ASSERT_ALWAYS();
+
+    return 0;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeSurfaceCoord2DFromBankPipe
+*
+*   @brief
+*       Compute surface x,y coordinates from bank/pipe info
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlComputeSurfaceCoord2DFromBankPipe(
+    AddrTileMode        tileMode,   ///< [in] tile mode
+    UINT_32*            pX,         ///< [in/out] x coordinate
+    UINT_32*            pY,         ///< [in/out] y coordinate
+    UINT_32             slice,      ///< [in] slice index
+    UINT_32             bank,       ///< [in] bank number
+    UINT_32             pipe,       ///< [in] pipe number
+    UINT_32             bankSwizzle,///< [in] bank swizzle
+    UINT_32             pipeSwizzle,///< [in] pipe swizzle
+    UINT_32             tileSlices, ///< [in] slices in a micro tile
+    BOOL_32             ignoreSE,   ///< [in] TRUE if shader engines are ignored
+    ADDR_TILEINFO*      pTileInfo   ///< [in] bank structure. **All fields to be valid on entry**
+    ) const
+{
+    // Overview: *pX and *pY are updated in place in two stages.
+    //   1. Bank-derived bits produced by ComputeSurfaceCoord2DFromBankPipe() are added.
+    //   2. Pipe-derived x bits, chosen per pipe config, are added.
+    // ignoreSE is not referenced anywhere in this implementation.
+    UINT_32 xBit;
+    UINT_32 yBit;
+    UINT_32 yBit3 = 0;
+    UINT_32 yBit4 = 0;
+    UINT_32 yBit5 = 0;
+    UINT_32 yBit6 = 0;
+
+    UINT_32 xBit3 = 0;
+    UINT_32 xBit4 = 0;
+    UINT_32 xBit5 = 0;
+
+    UINT_32 numPipes = GetPipePerSurf(pTileInfo->pipeConfig);
+
+    // Stage 1: the shared helper fills xyBits with the individual x/y bits for this bank/pipe.
+    CoordFromBankPipe xyBits = {0};
+    ComputeSurfaceCoord2DFromBankPipe(tileMode, *pX, *pY, slice, bank, pipe,
+                                      bankSwizzle, pipeSwizzle, tileSlices, pTileInfo,
+                                      &xyBits);
+    yBit3 = xyBits.yBit3;
+    yBit4 = xyBits.yBit4;
+    yBit5 = xyBits.yBit5;
+    yBit6 = xyBits.yBit6;
+
+    xBit3 = xyBits.xBit3;
+    xBit4 = xyBits.xBit4;
+    xBit5 = xyBits.xBit5;
+
+    yBit = xyBits.yBits;
+
+    // Stays 0 unless one of the two special pipe configs below is in use.
+    UINT_32 yBitTemp = 0;
+
+    // P4_32x32 and P8_32x64_32x32 take one bit of yBits, selected by the bank count, for use
+    // in the switch below, and drop xBit3 from the bank-derived x offset.
+    if ((pTileInfo->pipeConfig == ADDR_PIPECFG_P4_32x32) ||
+        (pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x64_32x32))
+    {
+        ADDR_ASSERT(pTileInfo->bankWidth == 1 && pTileInfo->macroAspectRatio > 1);
+        UINT_32 yBitToCheck = QLog2(pTileInfo->banks) - 1;
+
+        ADDR_ASSERT(yBitToCheck <= 3);
+
+        yBitTemp = _BIT(yBit, yBitToCheck);
+
+        xBit3 = 0;
+    }
+
+    // Pack the individual bits into numbers (presumably most significant bit first -- confirm
+    // against Bits2Number), then scale them to pixels and add to the coordinates.
+    yBit = Bits2Number(4, yBit6, yBit5, yBit4, yBit3);
+    xBit = Bits2Number(3, xBit5, xBit4, xBit3);
+
+    *pY += yBit * pTileInfo->bankHeight * MicroTileHeight;
+    *pX += xBit * numPipes * pTileInfo->bankWidth * MicroTileWidth;
+
+    // Stage 2 reads the y bits from the updated *pY, so it must run after the additions above.
+    //calculate the bank and pipe bits in x, y
+    UINT_32 xTile; //x in micro tile
+    UINT_32 x3 = 0;
+    UINT_32 x4 = 0;
+    UINT_32 x5 = 0;
+    UINT_32 x6 = 0;
+    UINT_32 y = *pY;
+
+    UINT_32 pipeBit0 = _BIT(pipe,0);
+    UINT_32 pipeBit1 = _BIT(pipe,1);
+    UINT_32 pipeBit2 = _BIT(pipe,2);
+
+    UINT_32 y3 = _BIT(y, 3);
+    UINT_32 y4 = _BIT(y, 4);
+    UINT_32 y5 = _BIT(y, 5);
+    UINT_32 y6 = _BIT(y, 6);
+
+    // bankbit0 after ^x4^x5
+    UINT_32 bankBit00 = _BIT(bank,0);
+    UINT_32 bankBit0 = 0;
+
+    // Each pipe config derives its x bits by XOR-ing pipe bits with y bits. Within a case the
+    // statement order matters where one x bit feeds another. Bits a case does not assign stay 0.
+    switch (pTileInfo->pipeConfig)
+    {
+        case ADDR_PIPECFG_P2:
+            x3 = pipeBit0 ^ y3;
+            break;
+        case ADDR_PIPECFG_P4_8x16:
+            x4 = pipeBit0 ^ y3;
+            x3 = pipeBit0 ^ y4;
+            break;
+        case ADDR_PIPECFG_P4_16x16:
+            x4 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P4_16x32:
+            x4 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P4_32x32:
+            // Uses yBitTemp and bank bit 0 to derive x4, and adds an extra x5-scaled offset
+            // directly to *pX on top of the xTile contribution below.
+            x5 = pipeBit1 ^ y5;
+            x3 = pipeBit0 ^ y3 ^ x5;
+            bankBit0 = yBitTemp ^ x5;
+            x4 = bankBit00 ^ x5 ^ bankBit0;
+            *pX += x5 * 4 * 1 * 8; // x5 * num_pipes * bank_width * 8;
+            break;
+        case ADDR_PIPECFG_P8_16x16_8x16:
+            x3 = pipeBit1 ^ y5;
+            x4 = pipeBit2 ^ y4;
+            x5 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_16x32_8x16:
+            x3 = pipeBit1 ^ y4;
+            x4 = pipeBit2 ^ y5;
+            x5 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x32_8x16:
+            x3 = pipeBit1 ^ y4;
+            x5 = pipeBit2 ^ y5;
+            x4 = pipeBit0 ^ y3 ^ x5;
+            break;
+        case ADDR_PIPECFG_P8_16x32_16x16:
+            x4 = pipeBit2 ^ y5;
+            x5 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x16:
+            x5 = pipeBit2 ^ y5;
+            x4 = pipeBit1 ^ y4;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x32_16x32:
+            x5 = pipeBit2 ^ y5;
+            x4 = pipeBit1 ^ y6;
+            x3 = pipeBit0 ^ y3 ^ x4;
+            break;
+        case ADDR_PIPECFG_P8_32x64_32x32:
+            // x6 is not part of xTile below; its contribution is added to *pX here.
+            x6 = pipeBit1 ^ y5;
+            x5 = pipeBit2 ^ y6;
+            x3 = pipeBit0 ^ y3 ^ x5;
+            bankBit0 = yBitTemp ^ x6;
+            x4 = bankBit00 ^ x5 ^ bankBit0;
+            *pX += x6 * 8 * 1 * 8; // x6 * num_pipes * bank_width * 8;
+            break;
+        default:
+            // Unknown pipe config: x3..x5 stay 0, so nothing more is added to *pX.
+            ADDR_ASSERT_ALWAYS();
+    }
+
+    xTile = Bits2Number(3, x5, x4, x3);
+
+    // xTile is in units of micro tiles; shifting by 3 multiplies it by 8.
+    *pX += xTile << 3;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPreAdjustBank
+*
+*   @brief
+*       Adjust bank before calculating address according to bank/pipe
+*   @return
+*       Adjusted bank
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlPreAdjustBank(
+    UINT_32         tileX,      ///< [in] x coordinate in unit of tile
+    UINT_32         bank,       ///< [in] bank
+    ADDR_TILEINFO*  pTileInfo   ///< [in] tile info
+    ) const
+{
+    // Only the P4_32x32 and P8_32x64_32x32 pipe configs with a bank width of 1 are adjusted;
+    // for every other combination the bank is returned untouched.
+    if (((pTileInfo->pipeConfig == ADDR_PIPECFG_P4_32x32) ||
+        (pTileInfo->pipeConfig == ADDR_PIPECFG_P8_32x64_32x32)) && (pTileInfo->bankWidth == 1))
+    {
+        UINT_32 bankBit0 = _BIT(bank, 0);
+        // Bits 1 and 2 of the tile x coordinate (named x4/x5 here, presumably because they are
+        // bits 4 and 5 of the pixel x coordinate -- confirm).
+        UINT_32 x4 = _BIT(tileX, 1);
+        UINT_32 x5 = _BIT(tileX, 2);
+
+        bankBit0 = bankBit0 ^ x4 ^ x5;
+        // NOTE(review): the OR can only set bit 0 of bank, never clear it. When bank bit 0 is
+        // already 1 and x4 ^ x5 is 1, the computed bankBit0 is 0 but bank keeps its bit set.
+        // Left as-is; confirm against the hardware addressing spec whether this is intended.
+        bank |= bankBit0;
+
+        // Terminated with a semicolon so the statement is valid however ADDR_ASSERT expands.
+        ADDR_ASSERT(pTileInfo->macroAspectRatio > 1);
+    }
+
+    return bank;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeSurfaceInfo
+*
+*   @brief
+*       Entry of si's ComputeSurfaceInfo
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlComputeSurfaceInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,    ///< [in] input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pOut    ///< [out] output structure
+    ) const
+{
+    // Seed the output tile index from the input before the base class computes the rest.
+    pOut->tileIndex = pIn->tileIndex;
+
+    const ADDR_E_RETURNCODE status = EgBasedAddrLib::HwlComputeSurfaceInfo(pIn, pOut);
+
+    return status;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeMipLevel
+*   @brief
+*       Compute MipLevel info (including level 0)
+*   @return
+*       TRUE if HWL's handled
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlComputeMipLevel(
+    ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn ///< [in/out] Input structure
+    ) const
+{
+    // basePitch is derived from level 0, so only sub levels are checked and adjusted.
+    const BOOL_32 isSubLevel = (pIn->mipLevel > 0);
+
+    // Expand-3x (96 bit) formats are exempt from the check: their base pitch is padded to a
+    // power of two before being divided by the expandX factor (3), so the programmed
+    // basePitch is never a power of two even with pow2Pad set.
+    if (isSubLevel && !AddrElemLib::IsExpand3x(pIn->format))
+    {
+        // With pow2Pad set, a non-zero power-of-two basePitch is required. Without pow2Pad
+        // there is nothing to assert, as that case is not really used for a mip chain.
+        ADDR_ASSERT(!pIn->flags.pow2Pad || ((pIn->basePitch != 0) && IsPow2(pIn->basePitch)));
+    }
+
+    // On SI the sub-level width comes from the base level pitch rather than the base width;
+    // it is clamped so it never drops below 1.
+    if (isSubLevel && (pIn->basePitch != 0))
+    {
+        pIn->width = Max(1u, pIn->basePitch >> pIn->mipLevel);
+    }
+
+    // pow2Pad is done in PostComputeMipLevel
+
+    return TRUE;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlCheckLastMacroTiledLvl
+*
+*   @brief
+*       Sets pOut->last2DLevel to TRUE if the next mip level would use a micro tiled mode
+*   @note
+*
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlCheckLastMacroTiledLvl(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, ///< [in] Input structure
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut      ///< [in/out] Output structure (used as input, too)
+    ) const
+{
+    // pow2Pad covers all mipmap cases; without it pOut->last2DLevel is left untouched.
+    if (!pIn->flags.pow2Pad)
+    {
+        return;
+    }
+
+    ADDR_ASSERT(IsMacroTiled(pIn->tileMode));
+
+    // The base level, or the fail-safe case of basePitch == 0, halves this level's computed
+    // pitch. Sub levels shift the base pitch by one more than the current level.
+    const BOOL_32 fromThisLevelPitch = (pIn->mipLevel == 0) || (pIn->basePitch == 0);
+    const UINT_32 nextPitch = fromThisLevelPitch ? (pOut->pitch >> 1)
+                                                 : (pIn->basePitch >> (pIn->mipLevel + 1));
+
+    // nextHeight must be shifted from this level's original height rather than a pow2 padded
+    // one, which requires the original height to be available in pOut->height.
+    ADDR_ASSERT(pOut->height != 0);
+
+    // In pixels, the next level's height is this level's height halved.
+    UINT_32 nextHeight = pOut->height >> 1;
+
+    // Special formats such as FMT_1 and FMT_32_32_32 can be linear only, so only block
+    // compressed formats need their height converted (rounded up to whole groups of 4).
+    if (AddrElemLib::IsBlockCompressed(pIn->format))
+    {
+        nextHeight = (nextHeight + 3) / 4;
+    }
+
+    nextHeight = NextPow2(nextHeight);
+
+    // Volume surfaces halve the slice count, clamped to 1 because the shift yields 0 when
+    // this level has a single slice. Other surfaces keep the slice count.
+    UINT_32 nextSlices = pIn->numSlices;
+
+    if (pIn->flags.volume)
+    {
+        nextSlices = Max(1u, pIn->numSlices >> 1);
+    }
+
+    const AddrTileMode nextTileMode = ComputeSurfaceMipLevelTileMode(pIn->tileMode,
+                                                                     pIn->bpp,
+                                                                     nextPitch,
+                                                                     nextHeight,
+                                                                     nextSlices,
+                                                                     pIn->numSamples,
+                                                                     pOut->pitchAlign,
+                                                                     pOut->heightAlign,
+                                                                     pOut->pTileInfo);
+
+    // This is the last macro tiled level when the next one comes out micro tiled.
+    pOut->last2DLevel = IsMicroTiled(nextTileMode);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlDegradeThickTileMode
+*
+*   @brief
+*       Degrades valid tile mode for thick modes if needed
+*
+*   @return
+*       Suitable tile mode
+***************************************************************************************************
+*/
+AddrTileMode SIAddrLib::HwlDegradeThickTileMode(
+    AddrTileMode        baseTileMode,   ///< [in] base tile mode
+    UINT_32             numSlices,      ///< [in] current number of slices
+    UINT_32*            pBytesPerTile   ///< [in/out] forwarded unchanged to the base class
+    ) const
+{
+    // SI adds nothing of its own here: all three arguments go straight to the base class.
+    const AddrTileMode degradedMode =
+        EgBasedAddrLib::HwlDegradeThickTileMode(baseTileMode, numSlices, pBytesPerTile);
+
+    return degradedMode;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlTileInfoEqual
+*
+*   @brief
+*       Return TRUE if all field are equal
+*   @note
+*       Only takes care of current HWL's data
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlTileInfoEqual(
+    const ADDR_TILEINFO* pLeft, ///<[in] Left compare operand
+    const ADDR_TILEINFO* pRight ///<[in] Right compare operand
+    ) const
+{
+    // pipeConfig is the field this class compares itself; a mismatch settles the answer
+    // without consulting the base class.
+    if (pLeft->pipeConfig != pRight->pipeConfig)
+    {
+        return FALSE;
+    }
+
+    // The remaining comparison is delegated to the base class.
+    return EgBasedAddrLib::HwlTileInfoEqual(pLeft, pRight);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::GetTileSetting
+*
+*   @brief
+*       Get tile setting infos by index.
+*   @return
+*       Tile setting info.
+***************************************************************************************************
+*/
+const ADDR_TILECONFIG* SIAddrLib::GetTileSetting(
+    UINT_32 index          ///< [in] Tile index
+    ) const
+{
+    // The index is only validated by this assert; callers must pass an in-range index
+    // because nothing else guards the table access.
+    ADDR_ASSERT(index < m_noOfEntries);
+
+    const ADDR_TILECONFIG* pEntry = &m_tileTable[index];
+
+    return pEntry;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlPostCheckTileIndex
+*
+*   @brief
+*       Map a tile setting to index if curIndex is invalid, otherwise check if curIndex matches
+*       tile mode/type/info and change the index if needed
+*   @return
+*       Tile index.
+***************************************************************************************************
+*/
+INT_32 SIAddrLib::HwlPostCheckTileIndex(
+    const ADDR_TILEINFO* pInfo,     ///< [in] Tile Info
+    AddrTileMode         mode,      ///< [in] Tile mode
+    AddrTileType         type,      ///< [in] Tile type
+    INT                  curIndex   ///< [in] Current index assigned in HwlSetupTileInfo
+    ) const
+{
+    const INT_32 numEntries = static_cast<INT_32>(m_noOfEntries);
+    INT_32 index = curIndex;
+
+    if (mode == ADDR_TM_LINEAR_GENERAL)
+    {
+        // Linear general has a dedicated index and never consults the tile table.
+        index = TileIndexLinearGeneral;
+    }
+    else
+    {
+        BOOL_32 macroTiled = IsMacroTiled(mode);
+
+        // m_tileTable may only be read with an index inside [0, numEntries). Any other
+        // incoming value (not just TileIndexInvalid) is treated as "no usable index" so the
+        // checks below never read outside the table.
+        const BOOL_32 indexInTable = (index >= 0) && (index < numEntries);
+
+        // We need to find a new index if any of these is true
+        // 1. curIndex is invalid or outside the table
+        // 2. tile mode is changed
+        // 3. tile info does not match for macro tiled
+        if ((index == TileIndexInvalid)        ||
+            !indexInTable                      ||
+            (mode != m_tileTable[index].mode)  ||
+            (macroTiled && !HwlTileInfoEqual(pInfo, &m_tileTable[index].info)))
+        {
+            // Linear search; index ends up equal to numEntries when nothing matches.
+            for (index = 0; index < numEntries; index++)
+            {
+                if (macroTiled)
+                {
+                    // macro tile modes need all to match
+                    if (HwlTileInfoEqual(pInfo, &m_tileTable[index].info) &&
+                        (mode == m_tileTable[index].mode)                 &&
+                        (type == m_tileTable[index].type))
+                    {
+                        break;
+                    }
+                }
+                else if (mode == ADDR_TM_LINEAR_ALIGNED)
+                {
+                    // linear mode only needs tile mode to match
+                    if (mode == m_tileTable[index].mode)
+                    {
+                        break;
+                    }
+                }
+                else
+                {
+                    // micro tile modes only need tile mode and tile type to match
+                    if (mode == m_tileTable[index].mode &&
+                        type == m_tileTable[index].type)
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    ADDR_ASSERT(index < numEntries);
+
+    // A failed search is reported as TileIndexInvalid rather than a past-the-end index.
+    if (index >= numEntries)
+    {
+        index = TileIndexInvalid;
+    }
+
+    return index;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlSetupTileCfg
+*
+*   @brief
+*       Map tile index to tile setting.
+*   @return
+*       ADDR_E_RETURNCODE
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlSetupTileCfg(
+    INT_32          index,          ///< [in] Tile index
+    INT_32          macroModeIndex, ///< [in] Index in macro tile mode table(CI)
+    ADDR_TILEINFO*  pInfo,          ///< [out] Tile Info
+    AddrTileMode*   pMode,          ///< [out] Tile mode
+    AddrTileType*   pType          ///< [out] Tile type
+    ) const
+{
+    // macroModeIndex is not referenced by this implementation.
+
+    // Global flag to control usage of tileIndex: when it is off, no output is written.
+    if (!UseTileIndex(index))
+    {
+        return ADDR_OK;
+    }
+
+    // Linear general is not stored in the table; it reports fixed values instead.
+    if (index == TileIndexLinearGeneral)
+    {
+        if (pMode)
+        {
+            *pMode = ADDR_TM_LINEAR_GENERAL;
+        }
+
+        if (pType)
+        {
+            *pType = ADDR_DISPLAYABLE;
+        }
+
+        if (pInfo)
+        {
+            pInfo->banks = 2;
+            pInfo->bankWidth = 1;
+            pInfo->bankHeight = 1;
+            pInfo->macroAspectRatio = 1;
+            pInfo->tileSplitBytes = 64;
+            pInfo->pipeConfig = ADDR_PIPECFG_P2;
+        }
+
+        return ADDR_OK;
+    }
+
+    // The unsigned cast makes negative indices fail this bounds check as well.
+    if (static_cast<UINT_32>(index) >= m_noOfEntries)
+    {
+        return ADDR_INVALIDPARAMS;
+    }
+
+    const ADDR_TILECONFIG* pEntry = GetTileSetting(index);
+    ADDR_E_RETURNCODE status = ADDR_OK;
+
+    if (pInfo)
+    {
+        *pInfo = pEntry->info;
+    }
+    else if (IsMacroTiled(pEntry->mode))
+    {
+        // A macro tiled entry with no pInfo to receive its tile info is an error. The mode
+        // and type outputs below are still written in that case.
+        status = ADDR_INVALIDPARAMS;
+    }
+
+    if (pMode)
+    {
+        *pMode = pEntry->mode;
+    }
+
+    if (pType)
+    {
+        *pType = pEntry->type;
+    }
+
+    return status;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::ReadGbTileMode
+*
+*   @brief
+*       Convert GB_TILE_MODE HW value to ADDR_TILECONFIG.
+*   @return
+*       NA.
+***************************************************************************************************
+*/
+VOID SIAddrLib::ReadGbTileMode(
+    UINT_32             regValue,   ///< [in] GB_TILE_MODE register
+    ADDR_TILECONFIG*    pCfg        ///< [out] output structure
+    ) const
+{
+    // Overlay the raw register value so its bit fields can be read by name.
+    GB_TILE_MODE reg;
+    reg.val = regValue;
+
+    // Array mode -> tile mode. Most values map one to one; 8 is 2D tiled xthick and values
+    // from 14 up are offset by 3.
+    const UINT_32 arrayMode = reg.f.array_mode;
+
+    if (arrayMode == 8) //ARRAY_2D_TILED_XTHICK
+    {
+        pCfg->mode = ADDR_TM_2D_TILED_XTHICK;
+    }
+    else if (arrayMode >= 14) //ARRAY_3D_TILED_XTHICK
+    {
+        pCfg->mode = static_cast<AddrTileMode>(arrayMode + 3);
+    }
+    else
+    {
+        pCfg->mode = static_cast<AddrTileMode>(arrayMode);
+    }
+
+    pCfg->type = static_cast<AddrTileType>(reg.f.micro_tile_mode);
+
+    // The register stores exponents; expand them into actual values.
+    pCfg->info.banks = 1 << (reg.f.num_banks + 1);
+    pCfg->info.bankWidth = 1 << reg.f.bank_width;
+    pCfg->info.bankHeight = 1 << reg.f.bank_height;
+    pCfg->info.macroAspectRatio = 1 << reg.f.macro_tile_aspect;
+    pCfg->info.tileSplitBytes = 64 << reg.f.tile_split;
+
+    // The stored pipe config is the register field plus one.
+    pCfg->info.pipeConfig = static_cast<AddrPipeCfg>(reg.f.pipe_config + 1);
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::InitTileSettingTable
+*
+*   @brief
+*       Initialize the ADDR_TILECONFIG table.
+*   @return
+*       TRUE if tile table is correctly initialized
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::InitTileSettingTable(
+    const UINT_32*  pCfg,           ///< [in] Pointer to table of tile configs
+    UINT_32         noOfEntries     ///< [in] Number of entries in the table above
+    )
+{
+    BOOL_32 initOk = TRUE;
+
+    ADDR_ASSERT(noOfEntries <= TileTableSize);
+
+    // Start from a zeroed table whatever happens next.
+    memset(m_tileTable, 0, sizeof(m_tileTable));
+
+    if (noOfEntries > TileTableSize)
+    {
+        // The assert above vanishes when asserts are compiled out, and filling the table with
+        // this count would then write past the end of m_tileTable. Fail initialization and
+        // leave the table empty.
+        m_noOfEntries = 0;
+        initOk = FALSE;
+    }
+    else
+    {
+        // A count of 0 means "use the full table size"; pCfg must then hold that many entries.
+        if (noOfEntries != 0)
+        {
+            m_noOfEntries = noOfEntries;
+        }
+        else
+        {
+            m_noOfEntries = TileTableSize;
+        }
+
+        if (pCfg) // From Client
+        {
+            // Decode each raw GB_TILE_MODE value into its table entry.
+            for (UINT_32 i = 0; i < m_noOfEntries; i++)
+            {
+                ReadGbTileMode(*(pCfg + i), &m_tileTable[i]);
+            }
+        }
+        else
+        {
+            // The client must supply the table; there is no built-in fallback.
+            ADDR_ASSERT_ALWAYS();
+            initOk = FALSE;
+        }
+    }
+
+    if (initOk)
+    {
+        // Sanity check: the linear-aligned slot must really hold the linear-aligned mode.
+        ADDR_ASSERT(m_tileTable[TILEINDEX_LINEAR_ALIGNED].mode == ADDR_TM_LINEAR_ALIGNED);
+    }
+
+    return initOk;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlGetTileIndex
+*
+*   @brief
+*       Return the virtual/real index for given mode/type/info
+*   @return
+*       ADDR_OK if successful.
+***************************************************************************************************
+*/
+ADDR_E_RETURNCODE SIAddrLib::HwlGetTileIndex(
+    const ADDR_GET_TILEINDEX_INPUT* pIn,
+    ADDR_GET_TILEINDEX_OUTPUT*      pOut) const
+{
+    // No current index is passed to the lookup, so the default from its declaration applies
+    // (presumably "no index assigned yet" -- the declaration is not visible here).
+    // A failed lookup is reported through pOut->index only; the return code is always ADDR_OK.
+    pOut->index = HwlPostCheckTileIndex(pIn->pTileInfo, pIn->tileMode, pIn->tileType);
+
+    return ADDR_OK;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlFmaskPreThunkSurfInfo
+*
+*   @brief
+*       Some preparation before thunking a ComputeSurfaceInfo call for Fmask
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlFmaskPreThunkSurfInfo(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT*    pFmaskIn,   ///< [in] Input of fmask info
+    const ADDR_COMPUTE_FMASK_INFO_OUTPUT*   pFmaskOut,  ///< [in] Output of fmask info
+    ADDR_COMPUTE_SURFACE_INFO_INPUT*        pSurfIn,    ///< [out] Input of thunked surface info
+    ADDR_COMPUTE_SURFACE_INFO_OUTPUT*       pSurfOut    ///< [out] Output of thunked surface info
+    ) const
+{
+    // The only preparation done here is carrying the fmask tile index over to the surface
+    // input; pFmaskOut and pSurfOut are not referenced.
+    pSurfIn->tileIndex = pFmaskIn->tileIndex;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlFmaskPostThunkSurfInfo
+*
+*   @brief
+*       Copy hwl extra field after calling thunked ComputeSurfaceInfo
+*   @return
+*       N/A
+***************************************************************************************************
+*/
+VOID SIAddrLib::HwlFmaskPostThunkSurfInfo(
+    const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,   ///< [in] Output of surface info
+    ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut           ///< [out] Output of fmask info
+    ) const
+{
+    // Hand the tile index computed by the thunked surface call back to the fmask output.
+    pFmaskOut->tileIndex = pSurfOut->tileIndex;
+
+    // The macro mode index is always reported as invalid by this implementation.
+    pFmaskOut->macroModeIndex = TileIndexInvalid;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlComputeFmaskBits
+*   @brief
+*       Computes fmask bits
+*   @return
+*       Fmask bits
+***************************************************************************************************
+*/
+UINT_32 SIAddrLib::HwlComputeFmaskBits(
+    const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+    UINT_32* pNumSamples
+    ) const
+{
+    UINT_32 numSamples = pIn->numSamples;
+    const UINT_32 numFrags = GetNumFragments(numSamples, pIn->numFrags);
+
+    // EQAA is the case where the fragment count differs from the sample count.
+    const BOOL_32 isEqaa = (numFrags != numSamples);
+    UINT_32 bpp = 0;
+
+    if (isEqaa)
+    {
+        ADDR_ASSERT(numFrags <= 8);
+    }
+
+    if (pIn->resolved)
+    {
+        if (isEqaa)
+        {
+            switch (numFrags)
+            {
+                case 1:
+                    bpp = (numSamples == 16) ? 16 : 8;
+                    break;
+                case 2:
+                    ADDR_ASSERT(numSamples >= 4);
+                    bpp = numSamples * 2;
+                    break;
+                case 4:
+                    ADDR_ASSERT(numSamples >= 4);
+                    bpp = numSamples * 4;
+                    break;
+                default: // numFrags == 8
+                    ADDR_ASSERT(numSamples >= 16);
+                    bpp = 16 * 4;
+                    break;
+            }
+        }
+        else // Normal AA
+        {
+            // The same as 8XX
+            bpp = ComputeFmaskResolvedBppFromNumSamples(numSamples);
+        }
+
+        // Every resolved case reports a single sample.
+        numSamples = 1;
+    }
+    else
+    {
+        if (isEqaa)
+        {
+            // Only the 1-fragment case changes the reported sample count; the other cases
+            // report it as passed in.
+            switch (numFrags)
+            {
+                case 1:
+                    bpp = 1;
+                    numSamples = (numSamples == 16) ? 16 : 8;
+                    break;
+                case 2:
+                    ADDR_ASSERT(numSamples >= 4);
+                    bpp = 2;
+                    break;
+                case 4:
+                    ADDR_ASSERT(numSamples >= 4);
+                    bpp = 4;
+                    break;
+                default: // numFrags == 8
+                    ADDR_ASSERT(numSamples == 16);
+                    bpp = 4;
+                    break;
+            }
+        }
+        else // Normal AA
+        {
+            bpp = ComputeFmaskNumPlanesFromNumSamples(numSamples);
+
+            // 2 samples are reported as 8; any other count is reported unchanged.
+            numSamples = (numSamples == 2) ? 8 : numSamples;
+        }
+    }
+
+    // Report the adjusted sample count through pNumSamples via SafeAssign.
+    SafeAssign(pNumSamples, numSamples);
+
+    return bpp;
+}
+
+/**
+***************************************************************************************************
+*   SIAddrLib::HwlOverrideTileMode
+*
+*   @brief
+*       Override tile modes (for PRT only), so that a client passing in a PRT mode gets the
+*       corresponding non-PRT mode on SI.
+*
+*   @return
+*       TRUE if the tile mode was overridden
+*
+***************************************************************************************************
+*/
+BOOL_32 SIAddrLib::HwlOverrideTileMode(
+    const ADDR_COMPUTE_SURFACE_INFO_INPUT*  pIn,       ///< [in] input structure
+    AddrTileMode*                           pTileMode, ///< [in/out] pointer to the tile mode
+    AddrTileType*                           pTileType  ///< [in/out] pointer to the tile type
+    ) const
+{
+    // pTileType is never read or written here; only the tile mode can change.
+    const AddrTileMode requestedMode = *pTileMode;
+    AddrTileMode replacementMode = requestedMode;
+
+    // Each PRT mode maps to a non-PRT mode; every other mode is kept.
+    switch (requestedMode)
+    {
+        case ADDR_TM_PRT_TILED_THIN1:
+            replacementMode = ADDR_TM_2D_TILED_THIN1;
+            break;
+
+        case ADDR_TM_PRT_TILED_THICK:
+        case ADDR_TM_PRT_2D_TILED_THICK:
+            replacementMode = ADDR_TM_2D_TILED_THICK;
+            break;
+
+        case ADDR_TM_PRT_3D_TILED_THICK:
+            replacementMode = ADDR_TM_3D_TILED_THICK;
+            break;
+
+        default:
+            break;
+    }
+
+    BOOL_32 overridden = FALSE;
+
+    if (replacementMode != requestedMode)
+    {
+        *pTileMode = replacementMode;
+        overridden = TRUE;
+
+        // A PRT tile mode is only expected on surfaces flagged as PRT.
+        ADDR_ASSERT(pIn->flags.prt == TRUE);
+    }
+
+    return overridden;
+}
+
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.h
new file mode 100644
index 00000000000..897beb1bb92
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/siaddrlib.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+***************************************************************************************************
+* @file  siaddrlib.h
+* @brief Contains the SIAddrLib class definition.
+***************************************************************************************************
+*/
+
+#ifndef __SI_ADDR_LIB_H__
+#define __SI_ADDR_LIB_H__
+
+#include "addrlib.h"
+#include "egbaddrlib.h"
+
+/**
+***************************************************************************************************
+* @brief Describes the information in tile mode table
+***************************************************************************************************
+*/
+struct ADDR_TILECONFIG
+{
+    AddrTileMode  mode; ///< Tile mode of this table entry
+    AddrTileType  type; ///< Tile type of this table entry
+    ADDR_TILEINFO info; ///< Tile info of this table entry
+};
+
+/**
+***************************************************************************************************
+* @brief SI specific settings structure.
+***************************************************************************************************
+*/
+struct SIChipSettings
+{
+    struct // anonymous struct holding one-bit chip flags
+    {
+        UINT_32 isSouthernIsland    : 1;
+        UINT_32 isTahiti            : 1;
+        UINT_32 isPitCairn          : 1;
+        UINT_32 isCapeVerde         : 1;
+        /// Oland/Hainan are of GFXIP 6.0, similar to SI
+        UINT_32 isOland             : 1;
+        UINT_32 isHainan            : 1;
+    };
+};
+
+/**
+***************************************************************************************************
+* @brief This class is the SI specific address library
+*        function set.
+***************************************************************************************************
+*/
+class SIAddrLib : public EgBasedAddrLib
+{
+public:
+    /// Creates an SIAddrLib object, allocated through operator new(pClient)
+    static AddrLib* CreateObj(const AddrClient* pClient)
+    {
+        return new(pClient) SIAddrLib(pClient);
+    }
+
+protected:
+    SIAddrLib(const AddrClient* pClient);
+    virtual ~SIAddrLib();
+
+    // Hwl interface - defined in AddrLib
+    virtual ADDR_E_RETURNCODE HwlComputeSurfaceInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual ADDR_E_RETURNCODE HwlConvertTileInfoToHW(
+        const ADDR_CONVERT_TILEINFOTOHW_INPUT* pIn,
+        ADDR_CONVERT_TILEINFOTOHW_OUTPUT* pOut) const;
+
+    virtual UINT_64 HwlComputeXmaskAddrFromCoord(
+        UINT_32 pitch, UINT_32 height, UINT_32 x, UINT_32 y, UINT_32 slice, UINT_32 numSlices,
+        UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pBitPosition) const;
+
+    virtual VOID HwlComputeXmaskCoordFromAddr(
+        UINT_64 addr, UINT_32 bitPosition, UINT_32 pitch, UINT_32 height, UINT_32 numSlices,
+        UINT_32 factor, BOOL_32 isLinear, BOOL_32 isWidth8, BOOL_32 isHeight8,
+        ADDR_TILEINFO* pTileInfo, UINT_32* pX, UINT_32* pY, UINT_32* pSlice) const;
+
+    virtual ADDR_E_RETURNCODE HwlGetTileIndex(
+        const ADDR_GET_TILEINDEX_INPUT* pIn,
+        ADDR_GET_TILEINDEX_OUTPUT*      pOut) const;
+
+    virtual BOOL_32 HwlComputeMipLevel(
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn) const;
+
+    virtual AddrChipFamily HwlConvertChipFamily(
+        UINT_32 uChipFamily, UINT_32 uChipRevision);
+
+    virtual BOOL_32 HwlInitGlobalParams(
+        const ADDR_CREATE_INPUT* pCreateIn);
+
+    virtual ADDR_E_RETURNCODE HwlSetupTileCfg(
+        INT_32 index, INT_32 macroModeIndex,
+        ADDR_TILEINFO* pInfo, AddrTileMode* pMode = 0, AddrTileType* pType = 0) const;
+
+    virtual VOID HwlComputeTileDataWidthAndHeightLinear(
+        UINT_32* pMacroWidth, UINT_32* pMacroHeight,
+        UINT_32 bpp, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual UINT_64 HwlComputeHtileBytes(
+        UINT_32 pitch, UINT_32 height, UINT_32 bpp,
+        BOOL_32 isLinear, UINT_32 numSlices, UINT_64* pSliceBytes, UINT_32 baseAlign) const;
+
+    virtual UINT_32 ComputePipeFromCoord(
+        UINT_32 x, UINT_32 y, UINT_32 slice,
+        AddrTileMode tileMode, UINT_32 pipeSwizzle, BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    virtual UINT_32 HwlGetPipes(const ADDR_TILEINFO* pTileInfo) const;
+
+    /// Pre-handler of 3x pitch (96 bit) adjustment
+    virtual UINT_32 HwlPreHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+    /// Post-handler of 3x pitch adjustment
+    virtual UINT_32 HwlPostHandleBaseLvl3xPitch(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, UINT_32 expPitch) const;
+
+    /// Dummy function to finalize the inheritance
+    virtual UINT_32 HwlComputeXmaskCoordYFrom8Pipe(
+        UINT_32 pipe, UINT_32 x) const;
+
+    // Sub-hwl interface - defined in EgBasedAddrLib
+    virtual VOID HwlSetupTileInfo(
+        AddrTileMode tileMode, ADDR_SURFACE_FLAGS flags,
+        UINT_32 bpp, UINT_32 pitch, UINT_32 height, UINT_32 numSamples,
+        ADDR_TILEINFO* inputTileInfo, ADDR_TILEINFO* outputTileInfo,
+        AddrTileType inTileType, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual UINT_32 HwlGetPitchAlignmentMicroTiled(
+        AddrTileMode tileMode, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples) const;
+
+    virtual UINT_64 HwlGetSizeAdjustmentMicroTiled(
+        UINT_32 thickness, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight) const;
+
+    virtual VOID HwlCheckLastMacroTiledLvl(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn, ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pOut) const;
+
+    virtual BOOL_32 HwlTileInfoEqual(
+        const ADDR_TILEINFO* pLeft, const ADDR_TILEINFO* pRight) const;
+
+    virtual AddrTileMode HwlDegradeThickTileMode(
+        AddrTileMode baseTileMode, UINT_32 numSlices, UINT_32* pBytesPerTile) const;
+
+    virtual BOOL_32 HwlOverrideTileMode(
+        const ADDR_COMPUTE_SURFACE_INFO_INPUT* pIn,
+        AddrTileMode* pTileMode,
+        AddrTileType* pTileType) const;
+
+    virtual BOOL_32 HwlSanityCheckMacroTiled(
+        ADDR_TILEINFO* pTileInfo) const
+    {
+        return TRUE; // the tile info is accepted unconditionally
+    }
+
+    virtual UINT_32 HwlGetPitchAlignmentLinear(UINT_32 bpp, ADDR_SURFACE_FLAGS flags) const;
+
+    virtual UINT_64 HwlGetSizeAdjustmentLinear(
+        AddrTileMode tileMode,
+        UINT_32 bpp, UINT_32 numSamples, UINT_32 baseAlign, UINT_32 pitchAlign,
+        UINT_32 *pPitch, UINT_32 *pHeight, UINT_32 *pHeightAlign) const;
+
+    virtual VOID HwlComputeSurfaceCoord2DFromBankPipe(
+        AddrTileMode tileMode, UINT_32* pX, UINT_32* pY, UINT_32 slice,
+        UINT_32 bank, UINT_32 pipe,
+        UINT_32 bankSwizzle, UINT_32 pipeSwizzle, UINT_32 tileSlices,
+        BOOL_32 ignoreSE,
+        ADDR_TILEINFO* pTileInfo) const;
+
+    virtual UINT_32 HwlPreAdjustBank(
+        UINT_32 tileX, UINT_32 bank, ADDR_TILEINFO* pTileInfo) const;
+
+    virtual INT_32 HwlPostCheckTileIndex(
+        const ADDR_TILEINFO* pInfo, AddrTileMode mode, AddrTileType type,
+        INT curIndex = TileIndexInvalid) const;
+
+    virtual VOID   HwlFmaskPreThunkSurfInfo(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pFmaskIn,
+        const ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut,
+        ADDR_COMPUTE_SURFACE_INFO_INPUT* pSurfIn,
+        ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut) const;
+
+    virtual VOID   HwlFmaskPostThunkSurfInfo(
+        const ADDR_COMPUTE_SURFACE_INFO_OUTPUT* pSurfOut,
+        ADDR_COMPUTE_FMASK_INFO_OUTPUT* pFmaskOut) const;
+
+    virtual UINT_32 HwlComputeFmaskBits(
+        const ADDR_COMPUTE_FMASK_INFO_INPUT* pIn,
+        UINT_32* pNumSamples) const;
+
+    virtual BOOL_32 HwlReduceBankWidthHeight(
+        UINT_32 tileSize, UINT_32 bpp, ADDR_SURFACE_FLAGS flags, UINT_32 numSamples,
+        UINT_32 bankHeightAlign, UINT_32 pipes,
+        ADDR_TILEINFO* pTileInfo) const
+    {
+        return TRUE; // nothing is reduced here; TRUE is reported unconditionally
+    }
+
+    // Protected non-virtual functions
+    VOID ComputeTileCoordFromPipeAndElemIdx(
+        UINT_32 elemIdx, UINT_32 pipe, AddrPipeCfg pipeCfg, UINT_32 pitchInMacroTile,
+        UINT_32 x, UINT_32 y, UINT_32* pX, UINT_32* pY) const;
+
+    UINT_32 TileCoordToMaskElementIndex(
+        UINT_32 tx, UINT_32 ty, AddrPipeCfg  pipeConfig,
+        UINT_32 *macroShift, UINT_32 *elemIdxBits) const;
+
+    BOOL_32 DecodeGbRegs(
+        const ADDR_REGISTER_VALUE* pRegValue);
+
+    const ADDR_TILECONFIG* GetTileSetting(
+        UINT_32 index) const;
+
+    static const UINT_32    TileTableSize = 32; ///< Capacity of m_tileTable
+    ADDR_TILECONFIG         m_tileTable[TileTableSize]; ///< Tile mode table entries
+    UINT_32                 m_noOfEntries; ///< presumably the number of valid m_tileTable entries - confirm
+
+private:
+
+    UINT_32 GetPipePerSurf(AddrPipeCfg pipeConfig) const;
+
+    VOID ReadGbTileMode(
+        UINT_32 regValue, ADDR_TILECONFIG* pCfg) const;
+    BOOL_32 InitTileSettingTable(
+        const UINT_32 *pSetting, UINT_32 noOfEntries);
+
+    SIChipSettings          m_settings; ///< SI specific chip flags
+};
+
+#endif
+
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
new file mode 100644
index 00000000000..50c42e3599a
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -0,0 +1,781 @@
+/*
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_cs.h"
+
+#include "os/os_time.h"
+#include "state_tracker/drm_driver.h"
+#include <amdgpu_drm.h>
+#include <xf86drm.h>
+#include <stdio.h>
+
+static const struct pb_vtbl amdgpu_winsys_bo_vtbl;
+
+static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
+{
+   assert(bo->vtbl == &amdgpu_winsys_bo_vtbl); /* only buffers using this file's vtbl may be downcast */
+   return (struct amdgpu_winsys_bo *)bo; /* downcast from the generic pb_buffer */
+}
+
+struct amdgpu_bomgr {
+   struct pb_manager base; /* generic manager interface; this is what amdgpu_bomgr_create returns */
+   struct amdgpu_winsys *rws; /* winsys whose device the manager allocates from */
+};
+
+static struct amdgpu_winsys *get_winsys(struct pb_manager *mgr)
+{
+   return ((struct amdgpu_bomgr*)mgr)->rws; /* mgr must be the base of a struct amdgpu_bomgr */
+}
+
+static struct amdgpu_winsys_bo *get_amdgpu_winsys_bo(struct pb_buffer *_buf)
+{
+   struct pb_buffer *base_buf;
+   pb_size offset;
+
+   /* The buffer is itself a winsys BO: just downcast it. */
+   if (_buf->vtbl == &amdgpu_winsys_bo_vtbl)
+      return amdgpu_winsys_bo(_buf);
+
+   /* Otherwise look at the buffer it is based on; the offset into that
+    * base buffer is fetched but not needed here. */
+   pb_get_base_buffer(_buf, &base_buf, &offset);
+   if (base_buf->vtbl != &amdgpu_winsys_bo_vtbl)
+      return NULL; /* not backed by a winsys BO */
+
+   return amdgpu_winsys_bo(base_buf);
+}
+
+static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
+                           enum radeon_bo_usage usage) /* usage is not consulted below */
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_winsys *ws = bo->rws;
+   int i;
+   /* Returns true when the buffer is found idle, false otherwise. */
+   if (bo->is_shared) {
+      /* We can't use user fences for shared buffers, because user fences
+       * are local to this process only. If we want to wait for all buffer
+       * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
+       */
+      bool buffer_busy = true;
+      int r;
+
+      r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
+      if (r)
+         fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
+                 r);
+      return !buffer_busy; /* buffer_busy was initialized to true */
+   }
+
+   if (timeout == 0) {
+      /* Timeout == 0 is quite simple: poll every ring's fence under the lock. */
+      pipe_mutex_lock(ws->bo_fence_lock);
+      for (i = 0; i < RING_LAST; i++)
+         if (bo->fence[i]) {
+            if (amdgpu_fence_wait(bo->fence[i], 0, false)) {
+               /* Release the idle fence to avoid checking it again later. */
+               amdgpu_fence_reference(&bo->fence[i], NULL);
+            } else {
+               pipe_mutex_unlock(ws->bo_fence_lock);
+               return false; /* the first busy fence ends the poll */
+            }
+         }
+      pipe_mutex_unlock(ws->bo_fence_lock);
+      return true;
+
+   } else {
+      struct pipe_fence_handle *fence[RING_LAST] = {};
+      bool fence_idle[RING_LAST] = {};
+      bool buffer_idle = true;
+      int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+      /* Take references to all fences, so that we can wait for them
+       * without the lock. */
+      pipe_mutex_lock(ws->bo_fence_lock);
+      for (i = 0; i < RING_LAST; i++)
+         amdgpu_fence_reference(&fence[i], bo->fence[i]);
+      pipe_mutex_unlock(ws->bo_fence_lock);
+
+      /* Now wait for the fences. */
+      for (i = 0; i < RING_LAST; i++) {
+         if (fence[i]) {
+            if (amdgpu_fence_wait(fence[i], abs_timeout, true))
+               fence_idle[i] = true;
+            else
+               buffer_idle = false; /* keep going: the other rings are still checked */
+         }
+      }
+
+      /* Release idle fences to avoid checking them again later. */
+      pipe_mutex_lock(ws->bo_fence_lock);
+      for (i = 0; i < RING_LAST; i++) {
+         if (fence[i] == bo->fence[i] && fence_idle[i]) /* skip if the BO's fence changed meanwhile */
+            amdgpu_fence_reference(&bo->fence[i], NULL);
+
+         amdgpu_fence_reference(&fence[i], NULL); /* drop the local reference taken above */
+      }
+      pipe_mutex_unlock(ws->bo_fence_lock);
+
+      return buffer_idle;
+   }
+}
+
+static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
+      struct radeon_winsys_cs_handle *buf)
+{
+   return ((struct amdgpu_winsys_bo*)buf)->initial_domain; /* buf is treated as an amdgpu_winsys_bo pointer */
+}
+
+static void amdgpu_bo_destroy(struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+   int i;
+   /* Tear down in order: the VA mapping, the VA range, then the BO handle. */
+   amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
+   amdgpu_va_range_free(bo->va_handle);
+   amdgpu_bo_free(bo->bo);
+   /* Drop the fence reference held for each ring. */
+   for (i = 0; i < RING_LAST; i++)
+      amdgpu_fence_reference(&bo->fence[i], NULL);
+   /* Subtract the page-aligned size from the VRAM or GTT usage counter. */
+   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+      bo->rws->allocated_vram -= align(bo->base.size, bo->rws->gart_page_size);
+   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
+      bo->rws->allocated_gtt -= align(bo->base.size, bo->rws->gart_page_size);
+   FREE(bo);
+}
+
+static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf,
+                           struct radeon_winsys_cs *rcs,
+                           enum pipe_transfer_usage usage)
+{
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+   struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; /* may be NULL; every use below checks it */
+   int r;
+   void *cpu = NULL;
+
+   /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
+   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+      /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
+      if (usage & PIPE_TRANSFER_DONTBLOCK) {
+         if (!(usage & PIPE_TRANSFER_WRITE)) {
+            /* Mapping for read.
+             *
+             * Since we are mapping for read, we don't need to wait
+             * if the GPU is using the buffer for read too
+             * (neither one is changing it).
+             *
+             * Only check whether the buffer is being used for write. */
+            if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
+                                                               RADEON_USAGE_WRITE)) {
+               cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
+               return NULL; /* flush was kicked off, but the caller asked not to block */
+            }
+
+            if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
+                                RADEON_USAGE_WRITE)) {
+               return NULL; /* still busy */
+            }
+         } else {
+            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
+               cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
+               return NULL; /* flush was kicked off, but the caller asked not to block */
+            }
+
+            if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
+                                RADEON_USAGE_READWRITE)) {
+               return NULL; /* still busy */
+            }
+         }
+      } else {
+         uint64_t time = os_time_get_nano(); /* start of the blocking wait, accounted below */
+
+         if (!(usage & PIPE_TRANSFER_WRITE)) {
+            /* Mapping for read.
+             *
+             * Since we are mapping for read, we don't need to wait
+             * if the GPU is using the buffer for read too
+             * (neither one is changing it).
+             *
+             * Only check whether the buffer is being used for write. */
+            if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
+                                                               RADEON_USAGE_WRITE)) {
+               cs->flush_cs(cs->flush_data, 0, NULL);
+            }
+            amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
+                           RADEON_USAGE_WRITE);
+         } else {
+            /* Mapping for write. */
+            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo))
+               cs->flush_cs(cs->flush_data, 0, NULL);
+
+            amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
+                           RADEON_USAGE_READWRITE);
+         }
+
+         bo->rws->buffer_wait_time += os_time_get_nano() - time;
+      }
+   }
+
+   /* If the buffer is created from user memory, return the user pointer. */
+   if (bo->user_ptr)
+       return bo->user_ptr;
+
+   r = amdgpu_bo_cpu_map(bo->bo, &cpu);
+   return r ? NULL : cpu; /* NULL when the CPU mapping failed */
+}
+
+static void amdgpu_bo_unmap(struct radeon_winsys_cs_handle *buf)
+{
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+   /* NOTE(review): also reached for user_ptr BOs, which amdgpu_bo_map never CPU-maps - confirm this is harmless. */
+   amdgpu_bo_cpu_unmap(bo->bo);
+}
+
+static void amdgpu_bo_get_base_buffer(struct pb_buffer *buf,
+                                      struct pb_buffer **base_buf,
+                                      unsigned *offset)
+{
+   *base_buf = buf; /* the buffer is reported as its own base buffer */
+   *offset = 0;     /* ...at offset zero */
+}
+
+static enum pipe_error amdgpu_bo_validate(struct pb_buffer *_buf,
+                                          struct pb_validate *vl,
+                                          unsigned flags)
+{
+   /* Always pinned, so there is nothing to validate: report success. */
+   return PIPE_OK;
+}
+
+static void amdgpu_bo_fence(struct pb_buffer *buf,
+                            struct pipe_fence_handle *fence)
+{ /* NOP: neither argument is used */
+}
+
+static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
+   amdgpu_bo_destroy,
+   NULL, /* never called (presumably the map slot - confirm against struct pb_vtbl) */
+   NULL, /* never called (presumably the unmap slot - confirm against struct pb_vtbl) */
+   amdgpu_bo_validate,
+   amdgpu_bo_fence,
+   amdgpu_bo_get_base_buffer,
+};
+
+static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr,
+                                                pb_size size,
+                                                const struct pb_desc *desc)
+{
+   struct amdgpu_winsys *rws = get_winsys(_mgr);
+   struct amdgpu_bo_desc *rdesc = (struct amdgpu_bo_desc*)desc; /* desc must be the base of an amdgpu_bo_desc */
+   struct amdgpu_bo_alloc_request request = {0};
+   amdgpu_bo_handle buf_handle;
+   uint64_t va = 0;
+   struct amdgpu_winsys_bo *bo;
+   amdgpu_va_handle va_handle;
+   int r;
+
+   assert(rdesc->initial_domain & RADEON_DOMAIN_VRAM_GTT);
+   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+   if (!bo) {
+      return NULL;
+   }
+   /* Describe the allocation: size, alignment, preferred heaps and flags. */
+   request.alloc_size = size;
+   request.phys_alignment = desc->alignment;
+
+   if (rdesc->initial_domain & RADEON_DOMAIN_VRAM) {
+      request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
+      if (rdesc->flags & RADEON_FLAG_CPU_ACCESS)
+         request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
+   }
+   if (rdesc->initial_domain & RADEON_DOMAIN_GTT) {
+      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
+      if (rdesc->flags & RADEON_FLAG_GTT_WC)
+         request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+   }
+
+   r = amdgpu_bo_alloc(rws->dev, &request, &buf_handle);
+   if (r) {
+      fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
+      fprintf(stderr, "amdgpu:    size      : %d bytes\n", size);
+      fprintf(stderr, "amdgpu:    alignment : %d bytes\n", desc->alignment);
+      fprintf(stderr, "amdgpu:    domains   : %d\n", rdesc->initial_domain);
+      goto error_bo_alloc;
+   }
+   /* Reserve a GPU virtual address range, then map the new BO into it. */
+   r = amdgpu_va_range_alloc(rws->dev, amdgpu_gpu_va_range_general,
+                             size, desc->alignment, 0, &va, &va_handle, 0);
+   if (r)
+      goto error_va_alloc;
+
+   r = amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP);
+   if (r)
+      goto error_va_map;
+   /* Everything succeeded: fill in the winsys BO. */
+   pipe_reference_init(&bo->base.reference, 1);
+   bo->base.alignment = desc->alignment;
+   bo->base.usage = desc->usage;
+   bo->base.size = size;
+   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
+   bo->rws = rws;
+   bo->bo = buf_handle;
+   bo->va = va;
+   bo->va_handle = va_handle;
+   bo->initial_domain = rdesc->initial_domain;
+   bo->unique_id = __sync_fetch_and_add(&rws->next_bo_unique_id, 1);
+   /* Account the page-aligned size to VRAM if requested, otherwise to GTT. */
+   if (rdesc->initial_domain & RADEON_DOMAIN_VRAM)
+      rws->allocated_vram += align(size, rws->gart_page_size);
+   else if (rdesc->initial_domain & RADEON_DOMAIN_GTT)
+      rws->allocated_gtt += align(size, rws->gart_page_size);
+
+   return &bo->base;
+   /* Error unwinding: each label releases what was acquired before it. */
+error_va_map:
+   amdgpu_va_range_free(va_handle);
+
+error_va_alloc:
+   amdgpu_bo_free(buf_handle);
+
+error_bo_alloc:
+   FREE(bo);
+   return NULL;
+}
+
+static void amdgpu_bomgr_flush(struct pb_manager *mgr)
+{
+   /* NOP: the flush hook does no work for this manager. */
+}
+
+/* This is for the cache bufmgr. */
+static boolean amdgpu_bomgr_is_buffer_busy(struct pb_manager *_mgr,
+                                           struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+
+   /* Busy as long as some command stream still references the buffer... */
+   if (amdgpu_bo_is_referenced_by_any_cs(bo))
+      return TRUE;
+
+   /* ...otherwise idle exactly when a zero-timeout wait succeeds. */
+   if (amdgpu_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE))
+      return FALSE;
+
+   return TRUE;
+}
+
+static void amdgpu_bomgr_destroy(struct pb_manager *mgr)
+{
+   FREE(mgr); /* frees the manager object itself; nothing else is released here */
+}
+
+struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws)
+{
+   struct amdgpu_bomgr *bomgr = CALLOC_STRUCT(amdgpu_bomgr);
+
+   if (bomgr == NULL)
+      return NULL;
+
+   /* Remember the winsys so that get_winsys() can hand it back later. */
+   bomgr->rws = rws;
+   /* Install the pb_manager entry points implemented in this file. */
+   bomgr->base.create_buffer = amdgpu_bomgr_create_bo;
+   bomgr->base.is_buffer_busy = amdgpu_bomgr_is_buffer_busy;
+   bomgr->base.flush = amdgpu_bomgr_flush;
+   bomgr->base.destroy = amdgpu_bomgr_destroy;
+   return &bomgr->base;
+}
+
+static unsigned eg_tile_split(unsigned tile_split)
+{
+   /* Decode a TILE_SPLIT encoding into its value; unknown encodings act like 4. */
+   switch (tile_split) {
+   case 0:  return 64;
+   case 1:  return 128;
+   case 2:  return 256;
+   case 3:  return 512;
+   case 5:  return 2048;
+   case 6:  return 4096;
+   case 4:
+   default: return 1024;
+   }
+}
+
+static unsigned eg_tile_split_rev(unsigned eg_tile_split)
+{
+   /* Map a tile-split value back to its encoding; unknown values yield 4. */
+   switch (eg_tile_split) {
+   case 64:   return 0;
+   case 128:  return 1;
+   case 256:  return 2;
+   case 512:  return 3;
+   case 2048: return 5;
+   case 4096: return 6;
+   default:   return 4; /* also covers 1024 */
+   }
+}
+
+static void amdgpu_bo_get_tiling(struct pb_buffer *_buf,
+                                 enum radeon_bo_layout *microtiled,
+                                 enum radeon_bo_layout *macrotiled,
+                                 unsigned *bankw, unsigned *bankh,
+                                 unsigned *tile_split,
+                                 unsigned *stencil_tile_split, /* never written here */
+                                 unsigned *mtilea,
+                                 bool *scanout) /* optional, may be NULL */
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_bo_info info = {0};
+   uint32_t tiling_flags;
+   int r;
+   /* Reads the tiling layout back from the metadata returned by amdgpu_bo_query_info. */
+   r = amdgpu_bo_query_info(bo->bo, &info);
+   if (r)
+      return; /* NOTE(review): all outputs are left untouched on failure */
+
+   tiling_flags = info.metadata.tiling_info;
+   /* Start from linear; the array mode below may mark one of the two as tiled. */
+   *microtiled = RADEON_LAYOUT_LINEAR;
+   *macrotiled = RADEON_LAYOUT_LINEAR;
+
+   if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4)  /* 2D_TILED_THIN1 */
+      *macrotiled = RADEON_LAYOUT_TILED;
+   else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
+      *microtiled = RADEON_LAYOUT_TILED;
+   /* Bank/split outputs are optional: write them only if all four pointers are set. */
+   if (bankw && bankh && tile_split && mtilea) {
+      *bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
+      *bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
+      *tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
+      *mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
+   }
+   if (scanout)
+      *scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
+}
+
+static void amdgpu_bo_set_tiling(struct pb_buffer *_buf,
+                                 struct radeon_winsys_cs *rcs, /* unused here */
+                                 enum radeon_bo_layout microtiled,
+                                 enum radeon_bo_layout macrotiled,
+                                 unsigned pipe_config,
+                                 unsigned bankw, unsigned bankh,
+                                 unsigned tile_split,
+                                 unsigned stencil_tile_split, /* unused here */
+                                 unsigned mtilea, unsigned num_banks,
+                                 uint32_t pitch, /* unused here */
+                                 bool scanout)
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_bo_metadata metadata = {0};
+   uint32_t tiling_flags = 0;
+   /* Packs the layout into tiling flags and stores them as the BO's metadata. */
+   if (macrotiled == RADEON_LAYOUT_TILED)
+      tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
+   else if (microtiled == RADEON_LAYOUT_TILED)
+      tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
+   else
+      tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
+   /* Bank sizes, macro tile aspect and bank count are stored as log2 values. */
+   tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, pipe_config);
+   tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(bankw));
+   tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(bankh));
+   if (tile_split) /* a tile_split of 0 leaves the TILE_SPLIT field at zero */
+      tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(tile_split));
+   tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(mtilea));
+   tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(num_banks)-1);
+
+   if (scanout)
+      tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
+   else
+      tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
+
+   metadata.tiling_info = tiling_flags;
+
+   amdgpu_bo_set_metadata(bo->bo, &metadata); /* the return value is not checked */
+}
+
+static struct radeon_winsys_cs_handle *amdgpu_get_cs_handle(struct pb_buffer *_buf)
+{
+   /* Return a direct pointer to the amdgpu_winsys_bo (NULL if _buf is not backed by one). */
+   return (struct radeon_winsys_cs_handle*)get_amdgpu_winsys_bo(_buf);
+}
+
+/**
+ * Create a buffer of at least \p size bytes through one of the winsys
+ * buffer managers.
+ *
+ * \return the new buffer, or NULL if the selected manager could not
+ *         create it.
+ */
+static struct pb_buffer *
+amdgpu_bo_create(struct radeon_winsys *rws,
+                 unsigned size,
+                 unsigned alignment,
+                 boolean use_reusable_pool,
+                 enum radeon_bo_domain domain,
+                 enum radeon_bo_flag flags)
+{
+   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
+   struct pb_manager *provider;
+   struct amdgpu_bo_desc desc;
+
+   /* On GPUs with at most 64MB of VRAM, treat VRAM as absent and place the
+    * buffer in write-combined GTT instead. This only picks the initial
+    * domain; the kernel is free to move the buffer if it wants to.
+    */
+   if ((domain & RADEON_DOMAIN_VRAM) && ws->info.vram_size <= 64*1024*1024) {
+      flags = RADEON_FLAG_GTT_WC;
+      domain = RADEON_DOMAIN_GTT;
+   }
+
+   /* Round the size up to the page size, the minimum alignment for normal
+    * BOs, so that the cached bufmgr can reuse buffers more often (small
+    * constant/uniform buffers benefit the most).
+    */
+   size = align(size, ws->gart_page_size);
+
+   memset(&desc, 0, sizeof(desc));
+   desc.base.alignment = alignment;
+   desc.initial_domain = domain;
+   desc.flags = flags;
+
+   /* Encode domains and flags into exactly one usage bit each; otherwise
+    * the cache manager might consider different sets of domains / flags
+    * compatible.
+    */
+   desc.base.usage = (domain == RADEON_DOMAIN_VRAM_GTT) ? 1 << 2 : domain >> 1;
+   assert(flags < sizeof(desc.base.usage) * 8 - 3);
+   desc.base.usage |= 1 << (flags + 3);
+
+   /* Pick the buffer manager: ws->cman when the reusable pool was
+    * requested, ws->kman otherwise.
+    */
+   provider = use_reusable_pool ? ws->cman : ws->kman;
+
+   /* A NULL result from create_buffer() is passed through unchanged to
+    * the caller.
+    */
+   return provider->create_buffer(provider, size, &desc.base);
+}
+
+static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
+                                               struct winsys_handle *whandle,
+                                               unsigned *stride)
+{
+   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
+   struct amdgpu_winsys_bo *bo;
+   enum amdgpu_bo_handle_type type;
+   struct amdgpu_bo_import_result result = {0};
+   uint64_t va;
+   amdgpu_va_handle va_handle;
+   struct amdgpu_bo_info info = {0};
+   enum radeon_bo_domain initial = 0;
+   int r;
+
+   /* Allocate the winsys BO first; every failure path below must free it. */
+   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+   if (!bo) {
+      return NULL;
+   }
+
+   switch (whandle->type) {
+   case DRM_API_HANDLE_TYPE_SHARED:
+      type = amdgpu_bo_handle_type_gem_flink_name;
+      break;
+   case DRM_API_HANDLE_TYPE_FD:
+      type = amdgpu_bo_handle_type_dma_buf_fd;
+      break;
+   default:
+      goto error; /* unsupported handle type: free bo instead of leaking it */
+   }
+
+   r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
+   if (r)
+      goto error;
+
+   /* Get initial domains. */
+   r = amdgpu_bo_query_info(result.buf_handle, &info);
+   if (r)
+      goto error_query;
+
+   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
+                             result.alloc_size, 1 << 20, 0, &va, &va_handle, 0);
+   if (r)
+      goto error_query; /* no VA range to release yet, only the imported BO */
+
+   r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
+   if (r)
+      goto error_va_map;
+
+   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
+      initial |= RADEON_DOMAIN_VRAM;
+   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
+      initial |= RADEON_DOMAIN_GTT;
+
+   /* All libdrm calls succeeded: fill in the winsys BO. */
+   pipe_reference_init(&bo->base.reference, 1);
+   bo->base.alignment = info.phys_alignment;
+   bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ;
+   bo->bo = result.buf_handle;
+   bo->base.size = result.alloc_size;
+   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
+   bo->rws = ws;
+   bo->va = va;
+   bo->va_handle = va_handle;
+   bo->initial_domain = initial;
+   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
+   bo->is_shared = true; /* imported BOs are always marked shared */
+
+   if (stride) /* stride is an optional output */
+      *stride = whandle->stride;
+
+   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+      ws->allocated_vram += align(bo->base.size, ws->gart_page_size);
+   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
+      ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+
+   return &bo->base;
+   /* Error unwinding: each label releases what was acquired before it. */
+error_va_map:
+   amdgpu_va_range_free(va_handle);
+
+error_query:
+   amdgpu_bo_free(result.buf_handle);
+
+error:
+   FREE(bo);
+   return NULL;
+}
+
+/* Export the buffer as the handle kind requested in whandle->type and record
+ * "stride" in the handle. Returns FALSE for unknown types or export failure. */
+static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
+                                    unsigned stride,
+                                    struct winsys_handle *whandle)
+{
+   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(buffer);
+   enum amdgpu_bo_handle_type export_type;
+
+   switch (whandle->type) {
+   case DRM_API_HANDLE_TYPE_SHARED:
+      export_type = amdgpu_bo_handle_type_gem_flink_name;
+      break;
+   case DRM_API_HANDLE_TYPE_FD:
+      export_type = amdgpu_bo_handle_type_dma_buf_fd;
+      break;
+   case DRM_API_HANDLE_TYPE_KMS:
+      export_type = amdgpu_bo_handle_type_kms;
+      break;
+   default:
+      return FALSE;
+   }
+
+   if (amdgpu_bo_export(bo->bo, export_type, &whandle->handle))
+      return FALSE;
+
+   whandle->stride = stride;
+   bo->is_shared = true;
+   return TRUE;
+}
+
+/* Wrap "size" bytes of user memory at "pointer" in a winsys BO: create a BO
+ * from the user memory, map it into the GPU VA space and account it as GTT. */
+static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
+                                            void *pointer, unsigned size)
+{
+   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
+   struct amdgpu_winsys_bo *bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+   amdgpu_bo_handle buf_handle;
+   amdgpu_va_handle va_handle;
+   uint64_t va;
+
+   if (!bo)
+      return NULL;
+
+   if (amdgpu_create_bo_from_user_mem(ws->dev, pointer, size, &buf_handle))
+      goto error;
+
+   if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
+                             size, 1 << 12, 0, &va, &va_handle, 0))
+      goto error_va_alloc;
+
+   if (amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP))
+      goto error_va_map;
+
+   /* Every libdrm call succeeded; fill in the wrapper. */
+   pipe_reference_init(&bo->base.reference, 1);
+   bo->base.alignment = 0;
+   bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ;
+   bo->base.size = size;
+   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
+   bo->rws = ws;
+   bo->bo = buf_handle;
+   bo->user_ptr = pointer;
+   bo->va = va;
+   bo->va_handle = va_handle;
+   bo->initial_domain = RADEON_DOMAIN_GTT;
+   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
+
+   ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+   return &bo->base;
+
+error_va_map:
+   amdgpu_va_range_free(va_handle);
+
+error_va_alloc:
+   amdgpu_bo_free(buf_handle);
+
+error:
+   FREE(bo);
+   return NULL;
+}
+
+static uint64_t amdgpu_bo_get_va(struct radeon_winsys_cs_handle *buf)
+{
+   return ((struct amdgpu_winsys_bo*)buf)->va; /* the handle is cast to the winsys BO; "va" is its GPU virtual address */
+}
+
+void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws)
+{  /* Install this file's buffer callbacks into the function pointers of ws->base. */
+   ws->base.buffer_get_cs_handle = amdgpu_get_cs_handle;
+   ws->base.buffer_set_tiling = amdgpu_bo_set_tiling;
+   ws->base.buffer_get_tiling = amdgpu_bo_get_tiling;
+   ws->base.buffer_map = amdgpu_bo_map;
+   ws->base.buffer_unmap = amdgpu_bo_unmap;
+   ws->base.buffer_wait = amdgpu_bo_wait;
+   ws->base.buffer_create = amdgpu_bo_create;
+   ws->base.buffer_from_handle = amdgpu_bo_from_handle;
+   ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
+   ws->base.buffer_get_handle = amdgpu_bo_get_handle;
+   ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
+   ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
new file mode 100644
index 00000000000..3739fd1366e
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2008 Jérôme Glisse
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#ifndef AMDGPU_BO_H
+#define AMDGPU_BO_H
+
+#include "amdgpu_winsys.h"
+#include "pipebuffer/pb_bufmgr.h"
+
+struct amdgpu_bo_desc {
+   struct pb_desc base; /* passed to the buffer manager as &desc.base */
+   /* amdgpu-specific creation parameters carried alongside the pb_desc: */
+   enum radeon_bo_domain initial_domain;
+   unsigned flags;
+};
+
+struct amdgpu_winsys_bo {
+   struct pb_buffer base; /* first member: the code casts between pb_buffer* and this struct */
+
+   struct amdgpu_winsys *rws; /* winsys the buffer was created or imported through */
+   void *user_ptr; /* from buffer_from_ptr */
+
+   amdgpu_bo_handle bo; /* libdrm buffer handle */
+   uint32_t unique_id; /* taken from ws->next_bo_unique_id; hashed by the CS buffer lookup */
+   amdgpu_va_handle va_handle; /* VA range allocation that "va" comes from */
+   uint64_t va; /* GPU virtual address the buffer is mapped at */
+   enum radeon_bo_domain initial_domain; /* domain(s) recorded at creation/import time */
+
+   /* how many command streams is this bo referenced in? */
+   int num_cs_references;
+
+   /* whether buffer_get_handle or buffer_from_handle was called,
+    * it can only transition from false to true
+    */
+   volatile int is_shared; /* bool (int for atomicity) */
+
+   /* Fences for buffer synchronization, one slot per ring type. */
+   struct pipe_fence_handle *fence[RING_LAST];
+};
+
+struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws);
+void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws);
+
+/* Make *dst reference src (src may be NULL) by forwarding to pb_reference(). */
+static inline void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
+                                              struct amdgpu_winsys_bo *src)
+{
+   pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
+}
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
new file mode 100644
index 00000000000..0f42298c2ad
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright © 2008 Jérôme Glisse
+ * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_cs.h"
+#include "os/os_time.h"
+#include <stdio.h>
+#include <amdgpu_drm.h>
+
+
+/* FENCES */
+
+/* Allocate a fence for ctx and the given IP/ring; it takes a ctx reference. */
+static struct pipe_fence_handle *
+amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
+                    unsigned ip_instance, unsigned ring)
+{
+   struct amdgpu_fence *af = CALLOC_STRUCT(amdgpu_fence); /* NOTE(review): not NULL-checked */
+   af->reference.count = 1;
+   af->ctx = ctx;
+   af->fence.context = ctx->ctx;
+   af->fence.ip_type = ip_type;
+   af->fence.ip_instance = ip_instance;
+   af->fence.ring = ring;
+   p_atomic_inc(&ctx->refcount);
+   return (struct pipe_fence_handle *)af;
+}
+
+/* Record the sequence number and user-fence CPU address of a submitted CS. */
+static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
+      struct amdgpu_cs_request *request, uint64_t *user_fence_cpu_address)
+{
+   struct amdgpu_fence *af = (struct amdgpu_fence*)fence;
+
+   af->user_fence_cpu_address = user_fence_cpu_address;
+   af->fence.fence = request->seq_no;
+}
+
+/* Mark the fence as signalled without asking the kernel. */
+static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
+{
+   struct amdgpu_fence *af = (struct amdgpu_fence*)fence;
+   af->signalled = true;
+}
+
+/* Wait for a fence to signal. "timeout" is an absolute time when "absolute"
+ * is true, otherwise it is converted with os_time_get_absolute_timeout().
+ * Returns true if the fence has signalled, false if it has not or the query fails. */
+bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
+                       bool absolute)
+{
+   struct amdgpu_fence *af = (struct amdgpu_fence*)fence;
+   uint64_t *user_fence;
+   int64_t deadline;
+   uint32_t expired;
+
+   if (af->signalled)
+      return true;
+
+   deadline = absolute ? (int64_t)timeout : os_time_get_absolute_timeout(timeout);
+
+   /* Fast path: the CPU-visible user fence value has already reached this
+    * fence's sequence number, so no kernel query is needed. */
+   user_fence = af->user_fence_cpu_address;
+   if (user_fence && *user_fence >= af->fence.fence) {
+      af->signalled = true;
+      return true;
+   }
+
+   /* Now use the libdrm query. */
+   if (amdgpu_cs_query_fence_status(&af->fence, deadline,
+                                    AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE,
+                                    &expired)) {
+      fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n");
+      return false;
+   }
+
+   if (!expired)
+      return false;
+
+   /* This variable can only transition from false to true, so it doesn't
+    * matter if threads race for it. */
+   af->signalled = true;
+   return true;
+}
+
+/* fence_wait hook of the winsys: wait on "fence" with a relative timeout. */
+static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
+      struct pipe_fence_handle *fence, uint64_t timeout)
+{
+   return amdgpu_fence_wait(fence, timeout, false);
+}
+
+/* CONTEXTS */
+
+/* Create a context with a zeroed 4 KiB user-fence buffer; NULL on any failure. */
+static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws)
+{
+   struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
+   int r;
+   struct amdgpu_bo_alloc_request alloc_buffer = {};
+   amdgpu_bo_handle buf_handle;
+
+   if (!ctx) /* out of memory: fail like the other error paths instead of crashing */
+      return NULL;
+   ctx->ws = amdgpu_winsys(ws);
+   ctx->refcount = 1;
+   r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create failed. (%i)\n", r);
+      FREE(ctx);
+      return NULL;
+   }
+
+   alloc_buffer.alloc_size = 4 * 1024;
+   alloc_buffer.phys_alignment = 4 *1024;
+   alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
+   r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
+      amdgpu_cs_ctx_free(ctx->ctx);
+      FREE(ctx);
+      return NULL;
+   }
+
+   r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
+      amdgpu_bo_free(buf_handle);
+      amdgpu_cs_ctx_free(ctx->ctx);
+      FREE(ctx);
+      return NULL;
+   }
+
+   memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
+   ctx->user_fence_bo = buf_handle;
+   return (struct radeon_winsys_ctx*)ctx;
+}
+
+static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
+{
+   amdgpu_ctx_unref((struct amdgpu_ctx*)rwctx); /* presumably drops the creation reference (refcount = 1); helper is defined elsewhere */
+}
+
+/* Map the libdrm reset state of the context to a pipe_reset_status value. */
+static enum pipe_reset_status
+amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx)
+{
+   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
+   uint32_t state, hangs;
+   int err = amdgpu_cs_query_reset_state(ctx->ctx, &state, &hangs);
+
+   if (err) {
+      fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", err);
+      return PIPE_NO_RESET; /* a failed query is reported as "no reset" */
+   }
+
+   switch (state) {
+   case AMDGPU_CTX_GUILTY_RESET:
+      return PIPE_GUILTY_CONTEXT_RESET;
+   case AMDGPU_CTX_INNOCENT_RESET:
+      return PIPE_INNOCENT_CONTEXT_RESET;
+   case AMDGPU_CTX_UNKNOWN_RESET:
+      return PIPE_UNKNOWN_CONTEXT_RESET;
+   case AMDGPU_CTX_NO_RESET:
+   default:
+      return PIPE_NO_RESET;
+   }
+}
+
+/* COMMAND SUBMISSION */
+
+static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
+{  /* Point cs->base and cs->ib at a fresh IB inside the big IB buffer; false on failure. */
+   /* The maximum size is 4MB - 1B, which is unaligned.
+    * Use an aligned size of (1M - 16) dwords, i.e. 4MB - 64B. */
+   const unsigned max_ib_size = (1024 * 1024 - 16) * 4;
+   const unsigned min_ib_size = 24 * 1024 * 4; /* free bytes needed to keep using the current buffer */
+   /* Start from an empty IB; base.buf stays NULL if this function fails. */
+   cs->base.cdw = 0;
+   cs->base.buf = NULL;
+
+   /* Allocate a new buffer for IBs if the current buffer is all used. */
+   if (!cs->big_ib_buffer ||
+       cs->used_ib_space + min_ib_size > cs->big_ib_buffer->size) {
+      struct radeon_winsys *ws = &cs->ctx->ws->base;
+      struct radeon_winsys_cs_handle *winsys_bo;
+
+      pb_reference(&cs->big_ib_buffer, NULL);
+      cs->big_ib_winsys_buffer = NULL;
+      cs->ib_mapped = NULL;
+      cs->used_ib_space = 0;
+
+      cs->big_ib_buffer = ws->buffer_create(ws, max_ib_size,
+                                            4096, true,
+                                            RADEON_DOMAIN_GTT,
+                                            RADEON_FLAG_CPU_ACCESS);
+      if (!cs->big_ib_buffer)
+         return false;
+
+      winsys_bo = ws->buffer_get_cs_handle(cs->big_ib_buffer);
+
+      cs->ib_mapped = ws->buffer_map(winsys_bo, NULL, PIPE_TRANSFER_WRITE);
+      if (!cs->ib_mapped) {
+         pb_reference(&cs->big_ib_buffer, NULL);
+         return false;
+      }
+
+      cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)winsys_bo;
+   }
+   /* The new IB starts at the first unused byte of the big buffer. */
+   cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
+   cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
+   cs->base.max_dw = (cs->big_ib_buffer->size - cs->used_ib_space) / 4; /* remaining space in dwords */
+   return true;
+}
+
+/* Set up the per-CS submission state: pick the hardware IP for the ring
+ * type, point the request at the single IB and allocate the buffer-list
+ * arrays. Returns FALSE, with nothing left allocated, if an allocation fails. */
+static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs,
+                                      enum ring_type ring_type)
+{
+   int i;
+
+   switch (ring_type) {
+   case RING_DMA:
+      cs->request.ip_type = AMDGPU_HW_IP_DMA;
+      break;
+   case RING_UVD:
+      cs->request.ip_type = AMDGPU_HW_IP_UVD;
+      break;
+   case RING_VCE:
+      cs->request.ip_type = AMDGPU_HW_IP_VCE;
+      break;
+   case RING_COMPUTE:
+      cs->request.ip_type = AMDGPU_HW_IP_COMPUTE;
+      break;
+   default:
+   case RING_GFX:
+      cs->request.ip_type = AMDGPU_HW_IP_GFX;
+      break;
+   }
+
+   cs->request.number_of_ibs = 1;
+   cs->request.ibs = &cs->ib;
+
+   /* Initial capacity shared by the three parallel arrays below. */
+   cs->max_num_buffers = 512;
+   cs->buffers = (struct amdgpu_cs_buffer*)
+                  CALLOC(1, cs->max_num_buffers * sizeof(struct amdgpu_cs_buffer));
+   if (!cs->buffers)
+      return FALSE;
+
+   cs->handles = CALLOC(1, cs->max_num_buffers * sizeof(amdgpu_bo_handle));
+   if (!cs->handles) {
+      FREE(cs->buffers);
+      return FALSE;
+   }
+
+   cs->flags = CALLOC(1, cs->max_num_buffers);
+   if (!cs->flags) {
+      FREE(cs->handles);
+      FREE(cs->buffers);
+      return FALSE;
+   }
+
+   /* -1 marks an unused hash slot. */
+   for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) {
+      cs->buffer_indices_hashlist[i] = -1;
+   }
+   return TRUE;
+}
+
+/* Drop every buffer reference held by the CS and reset its bookkeeping. */
+static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs)
+{
+   unsigned i;
+
+   for (i = 0; i < cs->num_buffers; i++) {
+      p_atomic_dec(&cs->buffers[i].bo->num_cs_references);
+      amdgpu_winsys_bo_reference(&cs->buffers[i].bo, NULL);
+      cs->handles[i] = NULL;
+      cs->flags[i] = 0;
+   }
+
+   cs->used_vram = 0;
+   cs->used_gart = 0;
+   cs->num_buffers = 0;
+
+   for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++)
+      cs->buffer_indices_hashlist[i] = -1;
+}
+
+static void amdgpu_destroy_cs_context(struct amdgpu_cs *cs)
+{
+   amdgpu_cs_context_cleanup(cs); /* first: it still reads cs->buffers/handles/flags */
+   FREE(cs->flags);
+   FREE(cs->buffers);
+   FREE(cs->handles);
+   FREE(cs->request.dependencies); /* NOTE(review): grown with realloc(), released with FREE() */
+}
+
+
+/* Create a command stream on "rwctx" for the given ring. "flush"/"flush_ctx" are
+ * stored in the CS; "trace_buf" is not used here. Returns NULL on failure. */
+static struct radeon_winsys_cs *
+amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
+                 enum ring_type ring_type,
+                 void (*flush)(void *ctx, unsigned flags,
+                               struct pipe_fence_handle **fence),
+                 void *flush_ctx,
+                 struct radeon_winsys_cs_handle *trace_buf)
+{
+   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
+   struct amdgpu_cs *cs = CALLOC_STRUCT(amdgpu_cs);
+
+   if (!cs)
+      return NULL;
+
+   cs->base.ring_type = ring_type;
+   cs->ctx = ctx;
+   cs->flush_cs = flush;
+   cs->flush_data = flush_ctx;
+
+   if (!amdgpu_init_cs_context(cs, ring_type)) {
+      FREE(cs);
+      return NULL;
+   }
+
+   if (!amdgpu_get_new_ib(cs)) {
+      amdgpu_destroy_cs_context(cs);
+      FREE(cs);
+      return NULL;
+   }
+
+   p_atomic_inc(&ctx->ws->num_cs);
+   return &cs->base;
+}
+
+#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value) /* append one dword; no bounds check against max_dw */
+
+int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
+{  /* Returns the index of bo in cs->buffers, or -1 if the CS does not contain it. */
+   unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1); /* assumes a power-of-two list length */
+   int i = cs->buffer_indices_hashlist[hash];
+
+   /* Slot is empty (-1, returned as "not found") or already points at bo. */
+   if (i == -1 || cs->buffers[i].bo == bo)
+      return i;
+
+   /* Hash collision, look for the BO in the list of relocs linearly. */
+   for (i = cs->num_buffers - 1; i >= 0; i--) {
+      if (cs->buffers[i].bo == bo) {
+         /* Put this reloc in the hash list.
+          * This will prevent additional hash collisions if there are
+          * several consecutive get_reloc calls for the same buffer.
+          *
+          * Example: Assuming buffers A,B,C collide in the hash list,
+          * the following sequence of relocs:
+          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
+          * will collide here: ^ and here:   ^,
+          * meaning that we should get very few collisions in the end. */
+         cs->buffer_indices_hashlist[hash] = i;
+         return i;
+      }
+   }
+   return -1; /* the linear search found nothing either */
+}
+
+static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
+                                 struct amdgpu_winsys_bo *bo,
+                                 enum radeon_bo_usage usage,
+                                 enum radeon_bo_domain domains,
+                                 unsigned priority,
+                                 enum radeon_bo_domain *added_domains)
+{  /* Add bo to the CS buffer list, or merge into its existing entry; returns its index. */
+   struct amdgpu_cs_buffer *reloc;
+   unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
+   int i = -1;
+
+   priority = MIN2(priority, 15); /* priorities are clamped to 15 */
+   *added_domains = 0;
+
+   i = amdgpu_get_reloc(cs, bo);
+   /* Already listed: OR in usage/domains, keep the highest priority, report only new domains. */
+   if (i >= 0) {
+      reloc = &cs->buffers[i];
+      reloc->usage |= usage;
+      *added_domains = domains & ~reloc->domains;
+      reloc->domains |= domains;
+      cs->flags[i] = MAX2(cs->flags[i], priority);
+      return i;
+   }
+
+   /* New relocation, check if the backing array is large enough. */
+   if (cs->num_buffers >= cs->max_num_buffers) {
+      uint32_t size;
+      cs->max_num_buffers += 10;
+      /* NOTE(review): none of the realloc() results below is checked for NULL. */
+      size = cs->max_num_buffers * sizeof(struct amdgpu_cs_buffer);
+      cs->buffers = realloc(cs->buffers, size);
+
+      size = cs->max_num_buffers * sizeof(amdgpu_bo_handle);
+      cs->handles = realloc(cs->handles, size);
+
+      cs->flags = realloc(cs->flags, cs->max_num_buffers);
+   }
+
+   /* Initialize the new relocation. */
+   cs->buffers[cs->num_buffers].bo = NULL; /* presumably so the reference helper sees no stale pointer in the slot */
+   amdgpu_winsys_bo_reference(&cs->buffers[cs->num_buffers].bo, bo);
+   cs->handles[cs->num_buffers] = bo->bo;
+   cs->flags[cs->num_buffers] = priority;
+   p_atomic_inc(&bo->num_cs_references);
+   reloc = &cs->buffers[cs->num_buffers];
+   reloc->bo = bo;
+   reloc->usage = usage;
+   reloc->domains = domains;
+
+   cs->buffer_indices_hashlist[hash] = cs->num_buffers; /* newest entry takes over the hash slot */
+
+   *added_domains = domains;
+   return cs->num_buffers++;
+}
+
+/* cs_add_reloc hook. The "domains" argument is ignored: amdgpu doesn't support
+ * changing the buffer placement during command submission, so the BO's initial
+ * domain is used. Newly added domains are charged to the CS memory counters. */
+static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
+                                    struct radeon_winsys_cs_handle *buf,
+                                    enum radeon_bo_usage usage,
+                                    enum radeon_bo_domain domains,
+                                    enum radeon_bo_priority priority)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+   enum radeon_bo_domain added_domains;
+   unsigned index = amdgpu_add_reloc(cs, bo, usage, bo->initial_domain,
+                                     priority, &added_domains);
+
+   if (added_domains & RADEON_DOMAIN_VRAM)
+      cs->used_vram += bo->base.size;
+   if (added_domains & RADEON_DOMAIN_GTT)
+      cs->used_gart += bo->base.size;
+
+   return index;
+}
+
+/* cs_get_reloc hook: index of "buf" in the CS buffer list, or -1. */
+static int amdgpu_cs_get_reloc(struct radeon_winsys_cs *rcs,
+                               struct radeon_winsys_cs_handle *buf)
+{
+   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
+   return amdgpu_get_reloc(amdgpu_cs(rcs), bo);
+}
+
+static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
+{
+   return TRUE; /* no validation is performed here; the hook always reports success */
+}
+
+/* True while the CS's buffers plus the extra "vram"/"gtt" bytes stay under
+ * 70% of the VRAM and GART sizes reported in ws->info. */
+static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+   return (cs->used_gart + gtt) < ws->info.gart_size * 0.7 &&
+          (cs->used_vram + vram) < ws->info.vram_size * 0.7;
+}
+
+static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
+                                    struct pipe_fence_handle **out_fence)
+{  /* Submit cs->request to the kernel and fence the buffers of the CS. */
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+   struct pipe_fence_handle *fence;
+   int i, j, r;
+
+   /* Create a fence. NOTE(review): it is used below without a NULL check. */
+   fence = amdgpu_fence_create(cs->ctx,
+                               cs->request.ip_type,
+                               cs->request.ip_instance,
+                               cs->request.ring);
+   if (out_fence)
+      amdgpu_fence_reference(out_fence, fence);
+
+   cs->request.number_of_dependencies = 0;
+
+   /* Since the kernel driver doesn't synchronize execution between different
+    * rings automatically, we have to add fence dependencies manually. */
+   pipe_mutex_lock(ws->bo_fence_lock);
+   for (i = 0; i < cs->num_buffers; i++) {
+      for (j = 0; j < RING_LAST; j++) {
+         struct amdgpu_cs_fence *dep;
+         unsigned idx;
+
+         struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence[j];
+         if (!bo_fence)
+            continue;
+         /* A fence from this same context, IP, instance and ring is skipped. */
+         if (bo_fence->ctx == cs->ctx &&
+             bo_fence->fence.ip_type == cs->request.ip_type &&
+             bo_fence->fence.ip_instance == cs->request.ip_instance &&
+             bo_fence->fence.ring == cs->request.ring)
+            continue;
+         /* Zero-timeout wait: skip fences that have already signalled. */
+         if (amdgpu_fence_wait((void *)bo_fence, 0, false))
+            continue;
+
+         idx = cs->request.number_of_dependencies++;
+         if (idx >= cs->max_dependencies) {
+            unsigned size;
+            /* Grow by 8 entries. NOTE(review): the realloc result is not checked. */
+            cs->max_dependencies = idx + 8;
+            size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
+            cs->request.dependencies = realloc(cs->request.dependencies, size);
+         }
+
+         dep = &cs->request.dependencies[idx];
+         memcpy(dep, &bo_fence->fence, sizeof(*dep));
+      }
+   }
+   /* UVD and VCE get no user fence; other IPs use the slot indexed by ring type. */
+   cs->request.fence_info.handle = NULL;
+   if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE) {
+	cs->request.fence_info.handle = cs->ctx->user_fence_bo;
+	cs->request.fence_info.offset = cs->base.ring_type;
+   }
+
+   r = amdgpu_cs_submit(cs->ctx->ctx, 0, &cs->request, 1);
+   if (r) {
+      if (r == -ENOMEM)
+         fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
+      else
+         fprintf(stderr, "amdgpu: The CS has been rejected, "
+                 "see dmesg for more information.\n");
+      /* The submission failed; mark the fence signalled (presumably so waiters don't block). */
+      amdgpu_fence_signalled(fence);
+   } else {
+      /* Success. */
+      uint64_t *user_fence = NULL;
+      if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE)
+         user_fence = cs->ctx->user_fence_cpu_address_base +
+                      cs->request.fence_info.offset;
+      amdgpu_fence_submitted(fence, &cs->request, user_fence);
+      /* Attach the new fence to every buffer of the CS, in the slot of this ring type. */
+      for (i = 0; i < cs->num_buffers; i++)
+         amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->base.ring_type],
+                                fence);
+   }
+   pipe_mutex_unlock(ws->bo_fence_lock);
+   amdgpu_fence_reference(&fence, NULL); /* drop the local reference */
+}
+
+static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
+{
+   /* no-op: amdgpu_cs_flush() submits from the calling thread before it returns */
+}
+
+DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE) /* debug_get_option_noop(): when true, amdgpu_cs_flush skips the submission */
+
+static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
+                            unsigned flags,
+                            struct pipe_fence_handle **fence,
+                            uint32_t cs_trace_id)
+{  /* cs_flush hook: pad the IB, submit it, then reset the CS and start a new IB. */
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+   /* "flags" and "cs_trace_id" are not used by this implementation. */
+   switch (cs->base.ring_type) {
+   case RING_DMA:
+      /* pad DMA ring to 8 DWs */
+      if (ws->info.chip_class <= SI) {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
+      } else {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0x00000000); /* NOP packet */
+      }
+      break;
+   case RING_GFX:
+      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
+       * (r6xx requires at least 4 dw alignment to avoid a hw bug).
+       */
+      if (ws->info.chip_class <= SI) {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
+      } else {
+         while (rcs->cdw & 7)
+            OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
+      }
+      break;
+   case RING_UVD: /* pad UVD ring to 16 DWs */
+      while (rcs->cdw & 15)
+         OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
+      break;
+   default:
+      break;
+   }
+
+   if (rcs->cdw > rcs->max_dw) {
+      fprintf(stderr, "amdgpu: command stream overflowed\n");
+   }
+   /* Add the IB buffer itself to the buffer list (read usage, minimum priority). */
+   amdgpu_cs_add_reloc(rcs, (void*)cs->big_ib_winsys_buffer,
+		       RADEON_USAGE_READ, 0, RADEON_PRIO_MIN);
+
+   /* Submit only if the CS is neither empty nor overflowed and RADEON_NOOP is not set. */
+   if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
+      int r;
+
+      r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
+                                cs->handles, cs->flags,
+                                &cs->request.resources);
+
+      if (r) {
+         fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r);
+         cs->request.resources = NULL;
+	 goto cleanup;
+      }
+
+      cs->ib.size = cs->base.cdw;
+      cs->used_ib_space += cs->base.cdw * 4; /* cdw counts dwords; the offset is in bytes */
+
+      amdgpu_cs_do_submission(cs, fence);
+
+      /* Cleanup. */
+      if (cs->request.resources)
+         amdgpu_bo_list_destroy(cs->request.resources);
+   }
+
+cleanup:
+   amdgpu_cs_context_cleanup(cs);
+   amdgpu_get_new_ib(cs); /* NOTE(review): result ignored; on failure cs->base.buf stays NULL */
+
+   ws->num_cs_flushes++;
+}
+
+/* Destroy the CS: its buffer list, its reference on the IB buffer and the CS itself. */
+static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   p_atomic_dec(&cs->ctx->ws->num_cs);
+   amdgpu_destroy_cs_context(cs);
+   pb_reference(&cs->big_ib_buffer, NULL);
+   FREE(cs);
+}
+
+/* cs_is_buffer_referenced hook: is "_buf" used by this CS with the given usage? */
+static boolean amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
+                                       struct radeon_winsys_cs_handle *_buf,
+                                       enum radeon_bo_usage usage)
+{
+   return amdgpu_bo_is_referenced_by_cs_with_usage(amdgpu_cs(rcs),
+                                                   (struct amdgpu_winsys_bo*)_buf,
+                                                   usage);
+}
+
+void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
+{  /* Install this file's context, CS and fence callbacks into the function pointers of ws->base. */
+   ws->base.ctx_create = amdgpu_ctx_create;
+   ws->base.ctx_destroy = amdgpu_ctx_destroy;
+   ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
+   ws->base.cs_create = amdgpu_cs_create;
+   ws->base.cs_destroy = amdgpu_cs_destroy;
+   ws->base.cs_add_reloc = amdgpu_cs_add_reloc;
+   ws->base.cs_get_reloc = amdgpu_cs_get_reloc;
+   ws->base.cs_validate = amdgpu_cs_validate;
+   ws->base.cs_memory_below_limit = amdgpu_cs_memory_below_limit;
+   ws->base.cs_flush = amdgpu_cs_flush;
+   ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
+   ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
+   ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
+   ws->base.fence_reference = amdgpu_fence_reference;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
new file mode 100644
index 00000000000..0842259044b
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#ifndef AMDGPU_CS_H
+#define AMDGPU_CS_H
+
+#include "amdgpu_bo.h"
+#include "util/u_memory.h"
+
+struct amdgpu_ctx {
+   struct amdgpu_winsys *ws;
+   amdgpu_context_handle ctx; /* passed to amdgpu_cs_ctx_free() on last unref */
+   amdgpu_bo_handle user_fence_bo; /* passed to amdgpu_bo_free() on last unref */
+   uint64_t *user_fence_cpu_address_base;
+   int refcount; /* decremented atomically by amdgpu_ctx_unref() */
+};
+
+struct amdgpu_cs_buffer {
+   struct amdgpu_winsys_bo *bo;
+   enum radeon_bo_usage usage; /* masked against the usage being queried */
+   enum radeon_bo_domain domains;
+};
+
+
+struct amdgpu_cs {
+   struct radeon_winsys_cs base; /* first member: amdgpu_cs() casts a base pointer */
+   struct amdgpu_ctx *ctx;
+
+   /* Flush CS callback; flush_data is presumably its ctx argument (TODO confirm). */
+   void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence);
+   void *flush_data;
+
+   /* A buffer out of which new IBs are allocated. */
+   struct pb_buffer *big_ib_buffer; /* for holding the reference */
+   struct amdgpu_winsys_bo *big_ib_winsys_buffer;
+   uint8_t *ib_mapped;
+   unsigned used_ib_space; /* advanced by base.cdw * 4 when the CS is flushed */
+
+   /* amdgpu_cs_submit parameters */
+   struct amdgpu_cs_request    request;
+   struct amdgpu_cs_ib_info    ib;
+
+   /* Relocs. buffers[] is indexed by the value amdgpu_get_reloc() returns. */
+   unsigned                    max_num_buffers;
+   unsigned                    num_buffers;
+   amdgpu_bo_handle            *handles;
+   uint8_t                     *flags;
+   struct amdgpu_cs_buffer     *buffers;
+
+   int                         buffer_indices_hashlist[512];
+
+   unsigned                    used_vram;
+   unsigned                    used_gart;
+
+   unsigned                    max_dependencies;
+};
+
+struct amdgpu_fence {
+   struct pipe_reference reference; /* updated by amdgpu_fence_reference() */
+
+   struct amdgpu_ctx *ctx;  /* submission context; unreferenced when the fence is freed */
+   struct amdgpu_cs_fence fence;
+   uint64_t *user_fence_cpu_address;
+
+   volatile int signalled;              /* bool (int for atomicity) */
+};
+
+static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx)
+{
+   if (!p_atomic_dec_zero(&ctx->refcount))
+      return; /* the count did not drop to zero: keep the context */
+   amdgpu_cs_ctx_free(ctx->ctx);
+   amdgpu_bo_free(ctx->user_fence_bo);
+   FREE(ctx);
+}
+
+static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
+                                          struct pipe_fence_handle *src)
+{
+   struct amdgpu_fence **slot = (struct amdgpu_fence **)dst;
+   struct amdgpu_fence *incoming = (struct amdgpu_fence *)src;
+   struct amdgpu_fence *outgoing = *slot; /* fence currently held in *dst */
+   if (pipe_reference(&outgoing->reference, &incoming->reference)) {
+      amdgpu_ctx_unref(outgoing->ctx);
+      FREE(outgoing);
+   }
+   *slot = incoming;
+}
+
+int amdgpu_get_reloc(struct amdgpu_cs *csc, struct amdgpu_winsys_bo *bo);
+
+static inline struct amdgpu_cs *
+amdgpu_cs(struct radeon_winsys_cs *base)
+{
+   return (struct amdgpu_cs*)base; /* valid because base is amdgpu_cs's first member */
+}
+
+static inline boolean
+amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs,
+                              struct amdgpu_winsys_bo *bo)
+{
+   const int refs = bo->num_cs_references; /* read once, used twice */
+   const bool as_many_as_cs = refs == bo->rws->num_cs;
+   return as_many_as_cs || (refs && amdgpu_get_reloc(cs, bo) != -1);
+}
+
+static inline boolean
+amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
+                                         struct amdgpu_winsys_bo *bo,
+                                         enum radeon_bo_usage usage)
+{
+   int slot = -1;
+   /* Only look the buffer up when it has at least one CS reference. */
+   if (bo->num_cs_references)
+      slot = amdgpu_get_reloc(cs, bo);
+
+   if (slot == -1)
+      return FALSE;
+
+   /* Referenced only if the recorded usage overlaps the queried one. */
+   return (cs->buffers[slot].usage & usage) != 0;
+}
+
+static inline boolean
+amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo)
+{
+   return bo->num_cs_references ? TRUE : FALSE; /* non-zero reference count */
+}
+
+bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
+                       bool absolute);
+void amdgpu_cs_init_functions(struct amdgpu_winsys *ws);
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
new file mode 100644
index 00000000000..8882c418e12
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/**
+ * This file is included by addrlib. It adds GPU family definitions and
+ * macros compatible with addrlib.
+ */
+
+#ifndef AMDGPU_ID_H
+#define AMDGPU_ID_H
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_LITTLE_ENDIAN)
+#define LITTLEENDIAN_CPU
+#elif defined(PIPE_ARCH_BIG_ENDIAN)
+#define BIGENDIAN_CPU
+#endif
+
+enum {
+	FAMILY_UNKNOWN,
+	FAMILY_SI, /* rev IDs: SI_* */
+	FAMILY_CI, /* rev IDs: CI_* */
+	FAMILY_KV, /* rev IDs: KV_*, KB_*, BV_*, ML_* */
+	FAMILY_VI, /* rev IDs: VI_* */
+	FAMILY_CZ, /* rev IDs: CZ_* */
+	FAMILY_PI, /* no rev IDs are defined in this header */
+	FAMILY_LAST, /* last enumerator */
+};
+
+/* SI specific rev IDs */
+enum {
+	SI_TAHITI_P_A11      = 1,
+	SI_TAHITI_P_A0       = SI_TAHITI_P_A11,      /*A0 is alias of A11*/
+	SI_TAHITI_P_A21      = 5,
+	SI_TAHITI_P_B0       = SI_TAHITI_P_A21,      /*B0 is alias of A21*/
+	SI_TAHITI_P_A22      = 6,
+	SI_TAHITI_P_B1       = SI_TAHITI_P_A22,      /*B1 is alias of A22*/
+
+	SI_PITCAIRN_PM_A11   = 20,
+	SI_PITCAIRN_PM_A0    = SI_PITCAIRN_PM_A11,   /*A0 is alias of A11*/
+	SI_PITCAIRN_PM_A12   = 21,
+	SI_PITCAIRN_PM_A1    = SI_PITCAIRN_PM_A12,   /*A1 is alias of A12*/
+
+	SI_CAPEVERDE_M_A11   = 40,
+	SI_CAPEVERDE_M_A0    = SI_CAPEVERDE_M_A11,   /*A0 is alias of A11*/
+	SI_CAPEVERDE_M_A12   = 41,
+	SI_CAPEVERDE_M_A1    = SI_CAPEVERDE_M_A12,   /*A1 is alias of A12*/
+
+	SI_OLAND_M_A0        = 60,
+
+	SI_HAINAN_V_A0       = 70,
+
+	SI_UNKNOWN           = 0xFF
+};
+
+/* Rev ID range tests; eChipRev may be any integer expression. */
+#define ASICREV_IS_TAHITI_P(eChipRev)	\
+	((eChipRev) < SI_PITCAIRN_PM_A11)
+#define ASICREV_IS_PITCAIRN_PM(eChipRev)	\
+	(((eChipRev) >= SI_PITCAIRN_PM_A11) && ((eChipRev) < SI_CAPEVERDE_M_A11))
+#define ASICREV_IS_CAPEVERDE_M(eChipRev)	\
+	(((eChipRev) >= SI_CAPEVERDE_M_A11) && ((eChipRev) < SI_OLAND_M_A0))
+#define ASICREV_IS_OLAND_M(eChipRev)	\
+	(((eChipRev) >= SI_OLAND_M_A0) && ((eChipRev) < SI_HAINAN_V_A0))
+#define ASICREV_IS_HAINAN_V(eChipRev)	\
+	((eChipRev) >= SI_HAINAN_V_A0)
+
+/* CI specific rev IDs; the range tests below accept any integer expression */
+enum {
+	CI_BONAIRE_M_A0 = 20,
+	CI_BONAIRE_M_A1 = 21,
+
+	CI_HAWAII_P_A0  = 40,
+
+	CI_UNKNOWN      = 0xFF
+};
+
+#define ASICREV_IS_BONAIRE_M(eChipRev)	\
+	(((eChipRev) >= CI_BONAIRE_M_A0) && ((eChipRev) < CI_HAWAII_P_A0))
+#define ASICREV_IS_HAWAII_P(eChipRev)	\
+	((eChipRev) >= CI_HAWAII_P_A0)
+
+/* KV specific rev IDs; the range tests below accept any integer expression */
+enum {
+	KV_SPECTRE_A0      = 0x01,       /* KV1 with Spectre GFX core, 8-8-1-2 (CU-Pix-Primitive-RB) */
+	KV_SPOOKY_A0       = 0x41,       /* KV2 with Spooky GFX core, including downgraded from Spectre core, 3-4-1-1 (CU-Pix-Primitive-RB) */
+	KB_KALINDI_A0      = 0x81,       /* KB with Kalindi GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	KB_KALINDI_A1      = 0x82,       /* KB with Kalindi GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	BV_KALINDI_A2      = 0x85,       /* BV with Kalindi GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	ML_GODAVARI_A0     = 0xa1,      /* ML with Godavari GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	ML_GODAVARI_A1     = 0xa2,      /* ML with Godavari GFX core, 2-4-1-1 (CU-Pix-Primitive-RB) */
+	KV_UNKNOWN = 0xFF
+};
+
+#define ASICREV_IS_SPECTRE(eChipRev)	\
+	(((eChipRev) >= KV_SPECTRE_A0) && ((eChipRev) < KV_SPOOKY_A0))         /* identify all versions of SPECTRE and supported features set */
+#define ASICREV_IS_SPOOKY(eChipRev)	\
+	(((eChipRev) >= KV_SPOOKY_A0) && ((eChipRev) < KB_KALINDI_A0))          /* identify all versions of SPOOKY and supported features set */
+#define ASICREV_IS_KALINDI(eChipRev)	\
+	(((eChipRev) >= KB_KALINDI_A0) && ((eChipRev) < KV_UNKNOWN))           /* identify all versions of KALINDI and supported features set */
+
+/* The following macros select subsets of the ASICREV_IS_KALINDI range */
+#define ASICREV_IS_KALINDI_BHAVANI(eChipRev)	\
+	(((eChipRev) >= BV_KALINDI_A2) && ((eChipRev) < ML_GODAVARI_A0))   /* identify all versions of BHAVANI and supported features set */
+#define ASICREV_IS_KALINDI_GODAVARI(eChipRev)	\
+	(((eChipRev) >= ML_GODAVARI_A0) && ((eChipRev) < KV_UNKNOWN)) /* identify all versions of GODAVARI and supported features set */
+
+/* VI specific rev IDs */
+enum {
+	VI_ICELAND_M_A0   = 1,
+
+	VI_TONGA_P_A0     = 20,
+	VI_TONGA_P_A1     = 21,
+
+	VI_FIJI_P_A0      = 60,
+
+	VI_UNKNOWN        = 0xFF
+};
+
+/* Rev ID range tests; eChipRev may be any integer expression. */
+#define ASICREV_IS_ICELAND_M(eChipRev)	\
+	((eChipRev) < VI_TONGA_P_A0)
+#define ASICREV_IS_TONGA_P(eChipRev)	\
+	(((eChipRev) >= VI_TONGA_P_A0) && ((eChipRev) < VI_FIJI_P_A0))
+#define ASICREV_IS_FIJI_P(eChipRev)	\
+	((eChipRev) >= VI_FIJI_P_A0)
+
+/* CZ specific rev IDs; ASICREV_IS_CARRIZO accepts any integer expression */
+enum {
+	CZ_CARRIZO_A0      = 0x01,
+	CZ_UNKNOWN      = 0xFF
+};
+
+#define ASICREV_IS_CARRIZO(eChipRev) \
+	((eChipRev) >= CZ_CARRIZO_A0)
+
+#endif /* AMDGPU_ID_H */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_public.h b/src/gallium/winsys/amdgpu/drm/amdgpu_public.h
new file mode 100644
index 00000000000..ad133b20bf6
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_public.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#ifndef AMDGPU_PUBLIC_H
+#define AMDGPU_PUBLIC_H
+
+#include "pipe/p_defines.h"
+
+struct radeon_winsys;
+struct pipe_screen;
+
+typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *);
+
+struct radeon_winsys *
+amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create);
+
+#endif
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
new file mode 100644
index 00000000000..358df381011
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright © 2011 Red Hat All Rights Reserved.
+ * Copyright © 2014 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+/* Contact:
+ *     Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_winsys.h"
+
+#ifndef NO_ENTRIES
+#define NO_ENTRIES 32
+#endif
+
+#ifndef NO_MACRO_ENTRIES
+#define NO_MACRO_ENTRIES 16
+#endif
+
+#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
+#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
+#endif
+
+
+static int amdgpu_surface_sanity(const struct radeon_surf *surf)
+{
+   const unsigned surf_type = RADEON_SURF_GET(surf->flags, TYPE);
+   /* Shorthands for the per-type limits checked at the bottom. */
+   const bool flat_y = surf->npix_y <= 1;
+   const bool flat_z = surf->npix_z <= 1;
+   const bool single_slice = surf->array_size <= 1;
+
+   /* The tile-mode-index flag is required. */
+   if (!(surf->flags & RADEON_SURF_HAS_TILE_MODE_INDEX))
+      return -EINVAL;
+
+   /* Every dimension and the array size must be at least 1. */
+   if (surf->npix_x == 0 || surf->npix_y == 0 || surf->npix_z == 0 ||
+       surf->array_size == 0)
+      return -EINVAL;
+
+   /* Likewise for the block dimensions. */
+   if (surf->blk_w == 0 || surf->blk_h == 0 || surf->blk_d == 0)
+      return -EINVAL;
+
+   /* Sample count must be 1, 2, 4 or 8. */
+   if (surf->nsamples != 1 && surf->nsamples != 2 &&
+       surf->nsamples != 4 && surf->nsamples != 8)
+      return -EINVAL;
+
+   /*
+    * Per-type limits:
+    *   1D             height, depth and array size <= 1
+    *   2D, CUBEMAP    depth and array size <= 1
+    *   3D             array size <= 1
+    *   1D_ARRAY       height and depth <= 1
+    *   2D_ARRAY       depth <= 1
+    */
+   switch (surf_type) {
+   case RADEON_SURF_TYPE_1D:
+      return (flat_y && flat_z && single_slice) ? 0 : -EINVAL;
+   case RADEON_SURF_TYPE_2D:
+   case RADEON_SURF_TYPE_CUBEMAP:
+      return (flat_z && single_slice) ? 0 : -EINVAL;
+   case RADEON_SURF_TYPE_3D:
+      return single_slice ? 0 : -EINVAL;
+   case RADEON_SURF_TYPE_1D_ARRAY:
+      return (flat_y && flat_z) ? 0 : -EINVAL;
+   case RADEON_SURF_TYPE_2D_ARRAY:
+      return flat_z ? 0 : -EINVAL;
+   default:
+      break;
+   }
+
+   return -EINVAL; /* unknown surface type */
+}
+
+static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput)
+{
+   return malloc(pInput->sizeInBytes); /* malloc result is returned unchecked */
+}
+
+static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput)
+{
+   free(pInput->pVirtAddr); /* counterpart of allocSysMem's malloc */
+   return ADDR_OK; /* always reports success */
+}
+
+/**
+ * This returns the number of banks for the surface.
+ * Possible values: 2, 4, 8, 16.
+ */
+static uint32_t cik_num_banks(struct amdgpu_winsys *ws,
+                              struct radeon_surf *surf)
+{
+   unsigned tile_bytes = 8 * 8 * surf->bpe; /* 8x8 elements times bpe */
+   unsigned slot = 0;
+
+   tile_bytes = MIN2(surf->tile_split, tile_bytes);
+   /* Count the halvings needed to bring the value down to 64 or less. */
+   while (tile_bytes > 64) {
+      tile_bytes >>= 1;
+      slot++;
+   }
+   assert(slot < 16);
+   return 2 << ((ws->amdinfo.gb_macro_tile_mode[slot] >> 6) & 0x3);
+}
+
+ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
+{
+   ADDR_CREATE_INPUT addrCreateInput = {0};
+   ADDR_CREATE_OUTPUT addrCreateOutput = {0};
+   ADDR_REGISTER_VALUE regValue = {0};
+   ADDR_CREATE_FLAGS createFlags = {{0}};
+   ADDR_E_RETURNCODE addrRet;
+   /* Creates an addrlib instance described by ws; returns its handle or NULL. */
+   addrCreateInput.size = sizeof(ADDR_CREATE_INPUT);
+   addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT);
+   /* Register values are taken from ws->amdinfo. */
+   regValue.noOfBanks = ws->amdinfo.mc_arb_ramcfg & 0x3;
+   regValue.gbAddrConfig = ws->amdinfo.gb_addr_cfg;
+   regValue.noOfRanks = (ws->amdinfo.mc_arb_ramcfg & 0x4) >> 2;
+
+   regValue.backendDisables = ws->amdinfo.backend_disable[0];
+   regValue.pTileConfig = ws->amdinfo.gb_tile_mode;
+   regValue.noOfEntries = sizeof(ws->amdinfo.gb_tile_mode) /
+                          sizeof(ws->amdinfo.gb_tile_mode[0]);
+   regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode;
+   regValue.noOfMacroEntries = sizeof(ws->amdinfo.gb_macro_tile_mode) /
+                               sizeof(ws->amdinfo.gb_macro_tile_mode[0]);
+   /* Clear all flags, then set useTileIndex and degradeBaseLevel. */
+   createFlags.value = 0;
+   createFlags.useTileIndex = 1;
+   createFlags.degradeBaseLevel = 1;
+   /* regValue and createFlags are copied by value into the request. */
+   addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
+   addrCreateInput.chipFamily = ws->family;
+   addrCreateInput.chipRevision = ws->rev_id;
+   addrCreateInput.createFlags = createFlags;
+   addrCreateInput.callbacks.allocSysMem = allocSysMem;
+   addrCreateInput.callbacks.freeSysMem = freeSysMem;
+   addrCreateInput.callbacks.debugPrint = 0;
+   addrCreateInput.regValue = regValue;
+
+   addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput);
+   if (addrRet != ADDR_OK)
+      return NULL;
+
+   return addrCreateOutput.hLib;
+}
+
+static int compute_level(struct amdgpu_winsys *ws,
+                         struct radeon_surf *surf, bool is_stencil,
+                         unsigned level, unsigned type, bool compressed,
+                         ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
+                         ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut)
+{
+   struct radeon_surf_level *surf_level;
+   ADDR_E_RETURNCODE ret;
+   /* Lay out one mip level via addrlib and append it to surf; 0 on success. */
+   AddrSurfInfoIn->mipLevel = level;
+   AddrSurfInfoIn->width = u_minify(surf->npix_x, level);
+   AddrSurfInfoIn->height = u_minify(surf->npix_y, level);
+   /* Slices: minified depth for 3D, 6 for cubemaps, else the array size. */
+   if (type == RADEON_SURF_TYPE_3D)
+      AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level);
+   else if (type == RADEON_SURF_TYPE_CUBEMAP)
+      AddrSurfInfoIn->numSlices = 6;
+   else
+      AddrSurfInfoIn->numSlices = surf->array_size;
+
+   if (level > 0) {
+      /* Set the base level pitch. This is needed for calculation
+       * of non-zero levels. */
+      if (is_stencil)
+         AddrSurfInfoIn->basePitch = surf->stencil_level[0].nblk_x;
+      else
+         AddrSurfInfoIn->basePitch = surf->level[0].nblk_x;
+
+      /* Convert blocks to pixels for compressed formats. */
+      if (compressed)
+         AddrSurfInfoIn->basePitch *= surf->blk_w;
+   }
+
+   ret = AddrComputeSurfaceInfo(ws->addrlib,
+                                AddrSurfInfoIn,
+                                AddrSurfInfoOut);
+   if (ret != ADDR_OK) {
+      return ret;
+   }
+   /* Record the result in the stencil or regular level entry. */
+   surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level];
+   surf_level->offset = align(surf->bo_size, AddrSurfInfoOut->baseAlign);
+   surf_level->slice_size = AddrSurfInfoOut->sliceSize;
+   surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe);
+   surf_level->npix_x = u_minify(surf->npix_x, level);
+   surf_level->npix_y = u_minify(surf->npix_y, level);
+   surf_level->npix_z = u_minify(surf->npix_z, level);
+   surf_level->nblk_x = AddrSurfInfoOut->pitch;
+   surf_level->nblk_y = AddrSurfInfoOut->height;
+   if (type == RADEON_SURF_TYPE_3D)
+      surf_level->nblk_z = AddrSurfInfoOut->depth;
+   else
+      surf_level->nblk_z = 1;
+   /* Map the tile mode addrlib chose onto the winsys surface mode. */
+   switch (AddrSurfInfoOut->tileMode) {
+   case ADDR_TM_LINEAR_GENERAL:
+      surf_level->mode = RADEON_SURF_MODE_LINEAR;
+      break;
+   case ADDR_TM_LINEAR_ALIGNED:
+      surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+      break;
+   case ADDR_TM_1D_TILED_THIN1:
+      surf_level->mode = RADEON_SURF_MODE_1D;
+      break;
+   case ADDR_TM_2D_TILED_THIN1:
+      surf_level->mode = RADEON_SURF_MODE_2D;
+      break;
+   default:
+      assert(0);
+   }
+
+   if (is_stencil)
+      surf->stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex;
+   else
+      surf->tiling_index[level] = AddrSurfInfoOut->tileIndex;
+   /* The buffer now ends where this level ends. */
+   surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize;
+   return 0;
+}
+
+static int amdgpu_surface_init(struct radeon_winsys *rws,
+                               struct radeon_surf *surf)
+{
+   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
+   unsigned level, mode, type;
+   bool compressed;
+   ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
+   ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
+   ADDR_TILEINFO AddrTileInfoIn = {0};
+   ADDR_TILEINFO AddrTileInfoOut = {0};
+   int r;
+   /* Validate surf, then let addrlib lay out every level (and stencil). */
+   r = amdgpu_surface_sanity(surf);
+   if (r)
+      return r;
+
+   AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
+   AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
+   AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
+
+   type = RADEON_SURF_GET(surf->flags, TYPE);
+   mode = RADEON_SURF_GET(surf->flags, MODE);
+   compressed = surf->blk_w == 4 && surf->blk_h == 4; /* 4x4 blocks */
+
+   /* MSAA and FMASK require 2D tiling. */
+   if (surf->nsamples > 1 ||
+       (surf->flags & RADEON_SURF_FMASK))
+      mode = RADEON_SURF_MODE_2D;
+
+   /* DB doesn't support linear layouts. */
+   if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) &&
+       mode < RADEON_SURF_MODE_1D)
+      mode = RADEON_SURF_MODE_1D;
+
+   /* Set the requested tiling mode. */
+   switch (mode) {
+   case RADEON_SURF_MODE_LINEAR:
+      AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_GENERAL;
+      break;
+   case RADEON_SURF_MODE_LINEAR_ALIGNED:
+      AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED;
+      break;
+   case RADEON_SURF_MODE_1D:
+      AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1;
+      break;
+   case RADEON_SURF_MODE_2D:
+      AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
+      break;
+   default:
+      assert(0);
+   }
+
+   /* The format must be set correctly for the allocation of compressed
+    * textures to work. In other cases, setting the bpp is sufficient. */
+   if (compressed) {
+      switch (surf->bpe) {
+      case 8:
+         AddrSurfInfoIn.format = ADDR_FMT_BC1;
+         break;
+      case 16:
+         AddrSurfInfoIn.format = ADDR_FMT_BC3;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   else {
+      AddrSurfInfoIn.bpp = surf->bpe * 8;
+   }
+
+   AddrSurfInfoIn.numSamples = surf->nsamples;
+   AddrSurfInfoIn.tileIndex = -1;
+
+   /* Set the micro tile type. */
+   if (surf->flags & RADEON_SURF_SCANOUT)
+      AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
+   else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
+      AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
+   else
+      AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
+   /* Surface usage flags derived from surf->flags, type and last_level. */
+   AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+   AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
+   AddrSurfInfoIn.flags.stencil = (surf->flags & RADEON_SURF_SBUFFER) != 0;
+   AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
+   AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
+   AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
+   AddrSurfInfoIn.flags.degrade4Space = 1;
+
+   /* This disables incorrect calculations (hacks) in addrlib. */
+   AddrSurfInfoIn.flags.noStencil = 1;
+
+   /* Set preferred macrotile parameters. This is usually required
+    * for shared resources. This is for 2D tiling only. */
+   if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
+       surf->bankw && surf->bankh && surf->mtilea && surf->tile_split) {
+      /* If any of these parameters are incorrect, the calculation
+       * will fail. */
+      AddrTileInfoIn.banks = cik_num_banks(ws, surf);
+      AddrTileInfoIn.bankWidth = surf->bankw;
+      AddrTileInfoIn.bankHeight = surf->bankh;
+      AddrTileInfoIn.macroAspectRatio = surf->mtilea;
+      AddrTileInfoIn.tileSplitBytes = surf->tile_split;
+      AddrSurfInfoIn.flags.degrade4Space = 0;
+      AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn;
+
+      /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set
+       * the tile index, because we are expected to know it if
+       * we know the other parameters.
+       *
+       * This is something that can easily be fixed in Addrlib.
+       * For now, just figure it out here.
+       * Note that only 2D_TILE_THIN1 is handled here.
+       */
+      assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+      assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1);
+
+      if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
+         AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
+      else
+         AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
+   }
+   /* compute_level() grows bo_size level by level, so start from zero. */
+   surf->bo_size = 0;
+
+   /* Calculate texture layout information. */
+   for (level = 0; level <= surf->last_level; level++) {
+      r = compute_level(ws, surf, false, level, type, compressed,
+                        &AddrSurfInfoIn, &AddrSurfInfoOut);
+      if (r)
+         return r;
+      /* Level 0 also provides the surface-wide alignment and tiling data. */
+      if (level == 0) {
+         surf->bo_alignment = AddrSurfInfoOut.baseAlign;
+         surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1;
+
+         /* For 2D modes only. */
+         if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
+            surf->bankw = AddrSurfInfoOut.pTileInfo->bankWidth;
+            surf->bankh = AddrSurfInfoOut.pTileInfo->bankHeight;
+            surf->mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio;
+            surf->tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes;
+            surf->num_banks = AddrSurfInfoOut.pTileInfo->banks;
+         }
+      }
+   }
+
+   /* Calculate texture layout information for stencil. */
+   if (surf->flags & RADEON_SURF_SBUFFER) {
+      AddrSurfInfoIn.bpp = 8;
+      /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
+      AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;
+
+      for (level = 0; level <= surf->last_level; level++) {
+         r = compute_level(ws, surf, true, level, type, compressed,
+                           &AddrSurfInfoIn, &AddrSurfInfoOut);
+         if (r)
+            return r;
+
+         if (level == 0) {
+            surf->stencil_offset = surf->stencil_level[0].offset;
+
+            /* For 2D modes only. */
+            if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
+               surf->stencil_tile_split =
+                     AddrSurfInfoOut.pTileInfo->tileSplitBytes;
+            }
+         }
+      }
+   }
+
+   return 0;
+}
+
+static int amdgpu_surface_best(struct radeon_winsys *rws,
+                               struct radeon_surf *surf)
+{
+   return 0; /* stub: neither argument is read or modified */
+}
+
+void amdgpu_surface_init_functions(struct amdgpu_winsys *ws)
+{
+   ws->base.surface_init = amdgpu_surface_init; /* addrlib-based layout */
+   ws->base.surface_best = amdgpu_surface_best; /* currently a no-op stub */
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
new file mode 100644
index 00000000000..012c9003b69
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright © 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright © 2009 Joakim Sindholt <opensource@zhasha.com>
+ * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#include "amdgpu_cs.h"
+#include "amdgpu_public.h"
+
+#include "util/u_hash_table.h"
+#include <amdgpu_drm.h>
+#include <xf86drm.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include "amdgpu_id.h"
+
+#define CIK_TILE_MODE_COLOR_2D			14
+
+#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)        (((x) >> 6) & 0x1f)
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P2               0
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16          4
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16         5
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32         6
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32         7
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16    8
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16    9
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16    10
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16   11
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16   12
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32   13
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32   14
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16   16
+#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16  17
+
+static struct util_hash_table *dev_tab = NULL;
+pipe_static_mutex(dev_tab_mutex);
+
+static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
+{
+   /* Decode PIPE_CONFIG of the 2D color tile mode; unknown values give 2. */
+   switch (CIK__GB_TILE_MODE__PIPE_CONFIG(
+              info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D])) {
+   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
+      return 16;
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
+      return 8;
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
+   case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
+      return 4;
+   case CIK__PIPE_CONFIG__ADDR_SURF_P2:
+   default:
+      return 2;
+   }
+}
+
+/* Pack fields taken from the Sea Islands GB_ADDR_CFG and MC_ARB_RAMCFG
+ * values into the GB_TILING_CONFIG layout, which only R600-R700 have. */
+static unsigned r600_get_gb_tiling_config(struct amdgpu_gpu_info *info)
+{
+   unsigned cfg;
+
+   cfg  = info->gb_addr_cfg & 0x7;                 /* num_pipes */
+   cfg |= (info->mc_arb_ramcfg & 0x3) << 4;        /* num_banks */
+   cfg |= ((info->gb_addr_cfg >> 4) & 0x7) << 8;   /* pipe_interleave_bytes */
+   cfg |= ((info->gb_addr_cfg >> 28) & 0x3) << 12; /* row_size */
+
+   return cfg;
+}
+
+/* Queries device info and fills in ws. On failure, releases ws->dev. */
+static boolean do_winsys_init(struct amdgpu_winsys *ws)
+{
+   struct amdgpu_buffer_size_alignments alignment_info = {};
+   struct amdgpu_heap_info vram, gtt;
+   struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {};
+   uint32_t vce_version = 0, vce_feature = 0;
+   int r;
+
+   /* Query hardware and driver information. */
+   r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_buffer_size_alignment(ws->dev, &alignment_info);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_DMA, 0, &dma);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_UVD, 0, &uvd);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_VCE, 0, &vce);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
+      goto fail;
+   }
+
+   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_VCE, 0, 0,
+				     &vce_version, &vce_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
+      goto fail;
+   }
+
+   /* Set chip identification. */
+   ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */
+   ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config;
+
+   switch (ws->info.pci_id) {
+#define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break;
+#include "pci_ids/radeonsi_pci_ids.h"
+#undef CHIPSET
+
+   default: /* PCI ID has no CHIPSET() entry in the included list */
+      fprintf(stderr, "amdgpu: Invalid PCI ID.\n");
+      goto fail;
+   }
+
+   if (ws->info.family >= CHIP_TONGA) /* relies on the CHIP_* enum ordering */
+      ws->info.chip_class = VI;
+   else if (ws->info.family >= CHIP_BONAIRE)
+      ws->info.chip_class = CIK;
+   else {
+      fprintf(stderr, "amdgpu: Unknown family.\n");
+      goto fail;
+   }
+
+   /* LLVM 3.6.1 or newer is required for VI. */
+   if (ws->info.chip_class >= VI &&
+       (HAVE_LLVM < 0x0306 ||
+        (HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 1))) {
+      fprintf(stderr, "amdgpu: LLVM 3.6.1 is required, got LLVM %i.%i.%i\n",
+              HAVE_LLVM >> 8, HAVE_LLVM & 255, MESA_LLVM_VERSION_PATCH);
+      goto fail;
+   }
+
+   /* family and rev_id are for addrlib */
+   switch (ws->info.family) {
+   case CHIP_BONAIRE:
+      ws->family = FAMILY_CI;
+      ws->rev_id = CI_BONAIRE_M_A0;
+      break;
+   case CHIP_KAVERI:
+      ws->family = FAMILY_KV;
+      ws->rev_id = KV_SPECTRE_A0;
+      break;
+   case CHIP_KABINI:
+      ws->family = FAMILY_KV;
+      ws->rev_id = KB_KALINDI_A0;
+      break;
+   case CHIP_HAWAII:
+      ws->family = FAMILY_CI;
+      ws->rev_id = CI_HAWAII_P_A0;
+      break;
+   case CHIP_MULLINS:
+      ws->family = FAMILY_KV;
+      ws->rev_id = ML_GODAVARI_A0;
+      break;
+   case CHIP_TONGA:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_TONGA_P_A0;
+      break;
+   case CHIP_ICELAND:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_ICELAND_M_A0;
+      break;
+   case CHIP_CARRIZO:
+      ws->family = FAMILY_CZ;
+      ws->rev_id = CZ_CARRIZO_A0;
+      break;
+   case CHIP_FIJI:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_FIJI_P_A0;
+      break;
+   default:
+      fprintf(stderr, "amdgpu: Unknown family.\n");
+      goto fail;
+   }
+
+   ws->addrlib = amdgpu_addr_create(ws);
+   if (!ws->addrlib) {
+      fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
+      goto fail;
+   }
+
+   /* Set hardware information. */
+   ws->info.gart_size = gtt.heap_size;
+   ws->info.vram_size = vram.heap_size;
+   /* convert the shader clock from KHz to MHz */
+   ws->info.max_sclk = ws->amdinfo.max_engine_clk / 1000;
+   ws->info.max_compute_units = 1; /* TODO */
+   ws->info.max_se = ws->amdinfo.num_shader_engines;
+   ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine;
+   ws->info.has_uvd = uvd.available_rings != 0;
+   ws->info.vce_fw_version =
+         vce.available_rings ? vce_version : 0;
+   ws->info.has_userptr = TRUE;
+   ws->info.r600_num_backends = ws->amdinfo.rb_pipes;
+   ws->info.r600_clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
+   ws->info.r600_tiling_config = r600_get_gb_tiling_config(&ws->amdinfo);
+   ws->info.r600_num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo);
+   ws->info.r600_max_pipes = ws->amdinfo.max_quad_shader_pipes; /* TODO: is this correct? */
+   ws->info.r600_virtual_address = TRUE;
+   ws->info.r600_has_dma = dma.available_rings != 0;
+
+   memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode,
+          sizeof(ws->amdinfo.gb_tile_mode));
+   ws->info.si_tile_mode_array_valid = TRUE;
+   ws->info.si_backend_enabled_mask = ws->amdinfo.enabled_rb_pipes_mask;
+
+   memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode,
+          sizeof(ws->amdinfo.gb_macro_tile_mode));
+   ws->info.cik_macrotile_mode_array_valid = TRUE;
+
+   ws->gart_page_size = alignment_info.size_remote;
+
+   return TRUE;
+
+fail: /* undo what was acquired here; ws itself is not freed in this function */
+   if (ws->addrlib)
+      AddrDestroy(ws->addrlib);
+   amdgpu_device_deinitialize(ws->dev);
+   ws->dev = NULL;
+   return FALSE;
+}
+
+static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
+{
+   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
+
+   pipe_mutex_destroy(ws->bo_fence_lock);
+
+   ws->cman->destroy(ws->cman); /* the cache manager was created on top of kman */
+   ws->kman->destroy(ws->kman);
+   AddrDestroy(ws->addrlib);
+
+   amdgpu_device_deinitialize(ws->dev);
+   FREE(rws); /* rws and ws point at the same allocation */
+}
+
+static void amdgpu_winsys_query_info(struct radeon_winsys *rws,
+                                     struct radeon_info *info)
+{
+   *info = amdgpu_winsys(rws)->info; /* hand out a copy of the cached info */
+}
+
+static boolean amdgpu_cs_request_feature(struct radeon_winsys_cs *rcs,
+                                         enum radeon_feature_id fid,
+                                         boolean enable)
+{
+   return FALSE; /* every request is answered with FALSE; arguments are unused */
+}
+
+/* Return the statistic selected by value; 0 when it cannot be obtained. */
+static uint64_t amdgpu_query_value(struct radeon_winsys *rws,
+                                   enum radeon_value_id value)
+{
+   struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
+   struct amdgpu_heap_info heap = {0}; /* a failed heap query then reports 0 */
+   uint64_t retval = 0;
+   switch (value) {
+   case RADEON_REQUESTED_VRAM_MEMORY:
+      return ws->allocated_vram;
+   case RADEON_REQUESTED_GTT_MEMORY:
+      return ws->allocated_gtt;
+   case RADEON_BUFFER_WAIT_TIME_NS:
+      return ws->buffer_wait_time;
+   case RADEON_TIMESTAMP:
+      amdgpu_query_info(ws->dev, AMDGPU_INFO_TIMESTAMP, 8, &retval);
+      return retval;
+   case RADEON_NUM_CS_FLUSHES:
+      return ws->num_cs_flushes;
+   case RADEON_NUM_BYTES_MOVED:
+      amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_BYTES_MOVED, 8, &retval);
+      return retval;
+   case RADEON_VRAM_USAGE:
+      amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &heap);
+      return heap.heap_usage;
+   case RADEON_GTT_USAGE:
+      amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &heap);
+      return heap.heap_usage;
+   case RADEON_GPU_TEMPERATURE:
+   case RADEON_CURRENT_SCLK:
+   case RADEON_CURRENT_MCLK:
+      return 0; /* not provided by this winsys */
+   case RADEON_GPU_RESET_COUNTER:
+      assert(0); /* not expected to be requested from this winsys */
+      return 0;
+   }
+   return 0; /* values not listed above */
+}
+
+/* Read num_registers registers into out. reg_offset is divided by 4 before
+ * it is passed to libdrm (presumably a byte offset -> dword index). */
+static void amdgpu_read_registers(struct radeon_winsys *rws,
+                                  unsigned reg_offset,
+                                  unsigned num_registers, uint32_t *out)
+{
+   amdgpu_read_mm_registers(amdgpu_winsys(rws)->dev, reg_offset / 4,
+                            num_registers, 0xffffffff, 0, out);
+}
+
+static unsigned hash_dev(void *key)
+{
+   intptr_t bits = pointer_to_intptr(key); /* the pointer value is the hash */
+#if defined(PIPE_ARCH_X86_64)
+   bits ^= bits >> 32; /* fold the upper half into the lower 32 bits */
+#endif
+   return bits;
+}
+
+static int compare_dev(void *key1, void *key2)
+{
+   return key1 != key2; /* 0 when both keys are the same pointer, else 1 */
+}
+
+static bool amdgpu_winsys_unref(struct radeon_winsys *ws)
+{
+   struct amdgpu_winsys *aws = (struct amdgpu_winsys*)ws;
+   bool last_ref;
+
+   /* Dropping the last reference and removing the device from dev_tab
+    * have to happen under one lock. Otherwise amdgpu_winsys_create
+    * running in another thread could still find this winsys in the
+    * table after its counter reached zero. The return value tells the
+    * caller whether that last reference was dropped. */
+   pipe_mutex_lock(dev_tab_mutex);
+
+   last_ref = pipe_reference(&aws->reference, NULL);
+   if (last_ref && dev_tab)
+      util_hash_table_remove(dev_tab, aws->dev);
+
+   pipe_mutex_unlock(dev_tab_mutex);
+   return last_ref;
+}
+
+/* Create the amdgpu winsys for a DRM fd, or add a reference to the winsys
+ * already stored in dev_tab for the same device handle. screen_create builds
+ * the screen of a new winsys. Returns NULL on failure. */
+PUBLIC struct radeon_winsys *
+amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
+{
+   struct amdgpu_winsys *ws;
+   drmVersionPtr version = drmGetVersion(fd);
+   amdgpu_device_handle dev;
+   uint32_t drm_major, drm_minor;
+
+   /* The DRM driver version of amdgpu is 3.x.x. version is NULL on failure. */
+   if (!version || version->version_major != 3) {
+      drmFreeVersion(version);
+      return NULL;
+   }
+   drmFreeVersion(version);
+
+   /* dev_tab is shared between threads; hold the mutex until we are done. */
+   pipe_mutex_lock(dev_tab_mutex);
+   if (!dev_tab)
+      dev_tab = util_hash_table_create(hash_dev, compare_dev);
+
+   /* Initialize the amdgpu device. This should always return the same pointer
+    * for the same fd. */
+   if (amdgpu_device_initialize(fd, &drm_major, &drm_minor, &dev)) {
+      pipe_mutex_unlock(dev_tab_mutex);
+      fprintf(stderr, "amdgpu: amdgpu_device_initialize failed.\n");
+      return NULL;
+   }
+
+   /* Lookup a winsys if we have already created one for this device. It
+    * holds its own device reference, so drop the one acquired just above. */
+   ws = util_hash_table_get(dev_tab, dev);
+   if (ws) {
+      amdgpu_device_deinitialize(dev);
+      pipe_reference(NULL, &ws->reference);
+      pipe_mutex_unlock(dev_tab_mutex);
+      return &ws->base;
+   }
+
+   /* Create a new winsys. */
+   ws = CALLOC_STRUCT(amdgpu_winsys);
+   if (!ws) {
+      amdgpu_device_deinitialize(dev);
+      pipe_mutex_unlock(dev_tab_mutex);
+      return NULL;
+   }
+
+   ws->dev = dev;
+   ws->info.drm_major = drm_major;
+   ws->info.drm_minor = drm_minor;
+   if (!do_winsys_init(ws))
+      goto fail;
+
+   /* Create managers. */
+   ws->kman = amdgpu_bomgr_create(ws);
+   if (!ws->kman)
+      goto fail;
+   ws->cman = pb_cache_manager_create(ws->kman, 500000, 2.0f, 0,
+                        (ws->info.vram_size + ws->info.gart_size) / 8);
+   if (!ws->cman)
+      goto fail;
+
+   pipe_reference_init(&ws->reference, 1);
+
+   /* Set functions. */
+   ws->base.unref = amdgpu_winsys_unref;
+   ws->base.destroy = amdgpu_winsys_destroy;
+   ws->base.query_info = amdgpu_winsys_query_info;
+   ws->base.cs_request_feature = amdgpu_cs_request_feature;
+   ws->base.query_value = amdgpu_query_value;
+   ws->base.read_registers = amdgpu_read_registers;
+   amdgpu_bomgr_init_functions(ws);
+   amdgpu_cs_init_functions(ws);
+   amdgpu_surface_init_functions(ws);
+
+   pipe_mutex_init(ws->bo_fence_lock);
+
+   /* Create the screen at the end. The winsys must be initialized
+    * completely. */
+   ws->base.screen = screen_create(&ws->base);
+   if (!ws->base.screen) {
+      amdgpu_winsys_destroy(&ws->base);
+      pipe_mutex_unlock(dev_tab_mutex);
+      return NULL;
+   }
+
+   util_hash_table_set(dev_tab, dev, ws);
+   /* Unlock only now, so that other threads creating the winsys from the
+    * same fd get a fully initialized winsys and not a half-way initialized one. */
+   pipe_mutex_unlock(dev_tab_mutex);
+   return &ws->base;
+
+fail:
+   pipe_mutex_unlock(dev_tab_mutex);
+   if (ws->cman)
+      ws->cman->destroy(ws->cman);
+   if (ws->kman)
+      ws->kman->destroy(ws->kman);
+   if (ws->dev) { /* non-NULL only if do_winsys_init succeeded (it cleans up itself) */
+      AddrDestroy(ws->addrlib);
+      amdgpu_device_deinitialize(ws->dev);
+   }
+   FREE(ws);
+   return NULL;
+}
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
new file mode 100644
index 00000000000..4d07644c9ef
--- /dev/null
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2009 Corbin Simpson
+ * Copyright © 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Marek Olšák <maraeo@gmail.com>
+ */
+
+#ifndef AMDGPU_WINSYS_H
+#define AMDGPU_WINSYS_H
+
+#include "gallium/drivers/radeon/radeon_winsys.h"
+#include "addrlib/addrinterface.h"
+#include "os/os_thread.h"
+#include <amdgpu.h>
+
+struct amdgpu_cs;
+
+struct amdgpu_winsys {
+   struct radeon_winsys base; /* first member: radeon_winsys* is cast to this */
+   struct pipe_reference reference; /* set to 1 by amdgpu_winsys_create */
+
+   amdgpu_device_handle dev; /* obtained from amdgpu_device_initialize */
+
+   pipe_mutex bo_fence_lock;
+
+   int num_cs; /* The number of command streams created. */
+   uint32_t next_bo_unique_id;
+   uint64_t allocated_vram; /* returned for RADEON_REQUESTED_VRAM_MEMORY */
+   uint64_t allocated_gtt; /* returned for RADEON_REQUESTED_GTT_MEMORY */
+   uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
+   uint64_t num_cs_flushes; /* returned for RADEON_NUM_CS_FLUSHES */
+   unsigned gart_page_size; /* size_remote from the buffer alignment query */
+
+   struct radeon_info info; /* copied out by the query_info hook */
+
+   struct pb_manager *kman; /* created by amdgpu_bomgr_create */
+   struct pb_manager *cman; /* cache manager created on top of kman */
+
+   struct amdgpu_gpu_info amdinfo; /* filled by amdgpu_query_gpu_info */
+   ADDR_HANDLE addrlib; /* created by amdgpu_addr_create */
+   uint32_t rev_id; /* for addrlib */
+   unsigned family; /* for addrlib */
+};
+
+static inline struct amdgpu_winsys *
+amdgpu_winsys(struct radeon_winsys *base)
+{
+   return (struct amdgpu_winsys*)base; /* base is the struct's first member */
+}
+
+void amdgpu_surface_init_functions(struct amdgpu_winsys *ws);
+ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws);
+
+#endif
diff --git a/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c b/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
index 9fedb121565..93ce6f224fe 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
+++ b/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
@@ -26,7 +26,7 @@ struct i915_drm_batchbuffer
    drm_intel_bo *bo;
 };
 
-static INLINE struct i915_drm_batchbuffer *
+static inline struct i915_drm_batchbuffer *
 i915_drm_batchbuffer(struct i915_winsys_batchbuffer *batch)
 {
    return (struct i915_drm_batchbuffer *)batch;
diff --git a/src/gallium/winsys/i915/drm/i915_drm_winsys.h b/src/gallium/winsys/i915/drm/i915_drm_winsys.h
index 7f0d718bdb7..56b9e150497 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_winsys.h
+++ b/src/gallium/winsys/i915/drm/i915_drm_winsys.h
@@ -28,7 +28,7 @@ struct i915_drm_winsys
    drm_intel_bufmgr *gem_manager;
 };
 
-static INLINE struct i915_drm_winsys *
+static inline struct i915_drm_winsys *
 i915_drm_winsys(struct i915_winsys *iws)
 {
    return (struct i915_drm_winsys *)iws;
@@ -58,13 +58,13 @@ struct i915_drm_buffer {
    unsigned flink;
 };
 
-static INLINE struct i915_drm_buffer *
+static inline struct i915_drm_buffer *
 i915_drm_buffer(struct i915_winsys_buffer *buffer)
 {
    return (struct i915_drm_buffer *)buffer;
 }
 
-static INLINE drm_intel_bo *
+static inline drm_intel_bo *
 intel_bo(struct i915_winsys_buffer *buffer)
 {
    return i915_drm_buffer(buffer)->bo;
diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
index 063524655b6..c6603e38a00 100644
--- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
@@ -17,7 +17,7 @@ static struct util_hash_table *fd_tab = NULL;
 
 pipe_static_mutex(nouveau_screen_mutex);
 
-boolean nouveau_drm_screen_unref(struct nouveau_screen *screen)
+bool nouveau_drm_screen_unref(struct nouveau_screen *screen)
 {
 	int ret;
 	if (screen->refcount == -1)
@@ -120,7 +120,11 @@ nouveau_drm_screen_create(int fd)
 	if (!screen)
 		goto err;
 
-	util_hash_table_set(fd_tab, intptr_to_pointer(fd), screen);
+	/* Use dupfd in hash table, to avoid errors if the original fd gets
+	 * closed by its owner. The hash key needs to live at least as long as
+	 * the screen.
+	 */
+	util_hash_table_set(fd_tab, intptr_to_pointer(dupfd), screen);
 	screen->refcount = 1;
 	pipe_mutex_unlock(nouveau_screen_mutex);
 	return &screen->base;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index fe98870967a..3a9ac445b24 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -44,7 +44,7 @@
 
 static const struct pb_vtbl radeon_bo_vtbl;
 
-static INLINE struct radeon_bo *radeon_bo(struct pb_buffer *bo)
+static inline struct radeon_bo *radeon_bo(struct pb_buffer *bo)
 {
     assert(bo->vtbl == &radeon_bo_vtbl);
     return (struct radeon_bo *)bo;
@@ -78,7 +78,7 @@ struct radeon_bomgr {
     struct list_head va_holes;
 };
 
-static INLINE struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr)
+static inline struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr)
 {
     return (struct radeon_bomgr *)mgr;
 }
@@ -101,33 +101,30 @@ static struct radeon_bo *get_radeon_bo(struct pb_buffer *_buf)
     return bo;
 }
 
-static void radeon_bo_wait(struct pb_buffer *_buf, enum radeon_bo_usage usage)
+static bool radeon_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
+                           enum radeon_bo_usage usage)
 {
-    struct radeon_bo *bo = get_radeon_bo(_buf);
-    struct drm_radeon_gem_wait_idle args = {0};
+   struct radeon_bo *bo = get_radeon_bo(_buf);
 
-    while (p_atomic_read(&bo->num_active_ioctls)) {
-        sched_yield();
+   /* Wait if any ioctl is being submitted with this buffer. */
+   if (!os_wait_until_zero(&bo->num_active_ioctls, timeout))
+      return false;
+
+   /* TODO: handle arbitrary timeout */
+    if (!timeout) {
+        struct drm_radeon_gem_busy args = {0};
+
+        args.handle = bo->handle;
+        return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY,
+                                   &args, sizeof(args)) == 0;
+    } else {
+        struct drm_radeon_gem_wait_idle args = {0};
+
+        args.handle = bo->handle;
+        while (drmCommandWrite(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE,
+                               &args, sizeof(args)) == -EBUSY);
+        return true;
     }
-
-    args.handle = bo->handle;
-    while (drmCommandWrite(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE,
-                           &args, sizeof(args)) == -EBUSY);
-}
-
-static boolean radeon_bo_is_busy(struct pb_buffer *_buf,
-                                 enum radeon_bo_usage usage)
-{
-    struct radeon_bo *bo = get_radeon_bo(_buf);
-    struct drm_radeon_gem_busy args = {0};
-
-    if (p_atomic_read(&bo->num_active_ioctls)) {
-        return TRUE;
-    }
-
-    args.handle = bo->handle;
-    return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY,
-                               &args, sizeof(args)) != 0;
 }
 
 static enum radeon_bo_domain get_valid_domain(enum radeon_bo_domain domain)
@@ -305,14 +302,34 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
     if (bo->ptr)
         os_munmap(bo->ptr, bo->base.size);
 
+    if (mgr->va) {
+        if (bo->rws->va_unmap_working) {
+            struct drm_radeon_gem_va va;
+
+            va.handle = bo->handle;
+            va.vm_id = 0;
+            va.operation = RADEON_VA_UNMAP;
+            va.flags = RADEON_VM_PAGE_READABLE |
+                       RADEON_VM_PAGE_WRITEABLE |
+                       RADEON_VM_PAGE_SNOOPED;
+            va.offset = bo->va;
+
+            if (drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_VA, &va,
+				    sizeof(va)) != 0 &&
+		va.operation == RADEON_VA_RESULT_ERROR) {
+                fprintf(stderr, "radeon: Failed to deallocate virtual address for buffer:\n");
+                fprintf(stderr, "radeon:    size      : %d bytes\n", bo->base.size);
+                fprintf(stderr, "radeon:    va        : 0x%016llx\n", (unsigned long long)bo->va);
+            }
+	}
+
+	radeon_bomgr_free_va(mgr, bo->va, bo->base.size);
+    }
+
     /* Close object. */
     args.handle = bo->handle;
     drmIoctl(bo->rws->fd, DRM_IOCTL_GEM_CLOSE, &args);
 
-    if (mgr->va) {
-        radeon_bomgr_free_va(mgr, bo->va, bo->base.size);
-    }
-
     pipe_mutex_destroy(bo->map_mutex);
 
     if (bo->initial_domain & RADEON_DOMAIN_VRAM)
@@ -331,14 +348,11 @@ void *radeon_bo_do_map(struct radeon_bo *bo)
     if (bo->user_ptr)
         return bo->user_ptr;
 
-    /* Return the pointer if it's already mapped. */
-    if (bo->ptr)
-        return bo->ptr;
-
     /* Map the buffer. */
     pipe_mutex_lock(bo->map_mutex);
-    /* Return the pointer if it's already mapped (in case of a race). */
+    /* Return the pointer if it's already mapped. */
     if (bo->ptr) {
+        bo->map_count++;
         pipe_mutex_unlock(bo->map_mutex);
         return bo->ptr;
     }
@@ -363,6 +377,7 @@ void *radeon_bo_do_map(struct radeon_bo *bo)
         return NULL;
     }
     bo->ptr = ptr;
+    bo->map_count = 1;
     pipe_mutex_unlock(bo->map_mutex);
 
     return bo->ptr;
@@ -392,8 +407,8 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                     return NULL;
                 }
 
-                if (radeon_bo_is_busy((struct pb_buffer*)bo,
-                                      RADEON_USAGE_WRITE)) {
+                if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
+                                    RADEON_USAGE_WRITE)) {
                     return NULL;
                 }
             } else {
@@ -402,8 +417,8 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                     return NULL;
                 }
 
-                if (radeon_bo_is_busy((struct pb_buffer*)bo,
-                                      RADEON_USAGE_READWRITE)) {
+                if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
+                                    RADEON_USAGE_READWRITE)) {
                     return NULL;
                 }
             }
@@ -421,7 +436,7 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                 if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
                     cs->flush_cs(cs->flush_data, 0, NULL);
                 }
-                radeon_bo_wait((struct pb_buffer*)bo,
+                radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                                RADEON_USAGE_WRITE);
             } else {
                 /* Mapping for write. */
@@ -435,7 +450,8 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                     }
                 }
 
-                radeon_bo_wait((struct pb_buffer*)bo, RADEON_USAGE_READWRITE);
+                radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
+                               RADEON_USAGE_READWRITE);
             }
 
             bo->mgr->rws->buffer_wait_time += os_time_get_nano() - time;
@@ -447,7 +463,26 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
 
 static void radeon_bo_unmap(struct radeon_winsys_cs_handle *_buf)
 {
-    /* NOP */
+    struct radeon_bo *bo = (struct radeon_bo*)_buf;
+
+    if (bo->user_ptr)
+        return;
+
+    pipe_mutex_lock(bo->map_mutex);
+    if (!bo->ptr) {
+        pipe_mutex_unlock(bo->map_mutex);
+        return; /* it's not been mapped */
+    }
+
+    assert(bo->map_count);
+    if (--bo->map_count) {
+        pipe_mutex_unlock(bo->map_mutex);
+        return; /* it's been mapped multiple times */
+    }
+
+    os_munmap(bo->ptr, bo->base.size);
+    bo->ptr = NULL;
+    pipe_mutex_unlock(bo->map_mutex);
 }
 
 static void radeon_bo_get_base_buffer(struct pb_buffer *buf,
@@ -607,7 +642,7 @@ static boolean radeon_bomgr_is_buffer_busy(struct pb_manager *_mgr,
        return TRUE;
    }
 
-   if (radeon_bo_is_busy((struct pb_buffer*)bo, RADEON_USAGE_READWRITE)) {
+   if (!radeon_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) {
        return TRUE;
    }
 
@@ -739,10 +774,11 @@ static void radeon_bo_set_tiling(struct pb_buffer *_buf,
                                  struct radeon_winsys_cs *rcs,
                                  enum radeon_bo_layout microtiled,
                                  enum radeon_bo_layout macrotiled,
+                                 unsigned pipe_config,
                                  unsigned bankw, unsigned bankh,
                                  unsigned tile_split,
                                  unsigned stencil_tile_split,
-                                 unsigned mtilea,
+                                 unsigned mtilea, unsigned num_banks,
                                  uint32_t pitch,
                                  bool scanout)
 {
@@ -758,9 +794,7 @@ static void radeon_bo_set_tiling(struct pb_buffer *_buf,
         cs->flush_cs(cs->flush_data, 0, NULL);
     }
 
-    while (p_atomic_read(&bo->num_active_ioctls)) {
-        sched_yield();
-    }
+    os_wait_until_zero(&bo->num_active_ioctls, PIPE_TIMEOUT_INFINITE);
 
     if (microtiled == RADEON_LAYOUT_TILED)
         args.tiling_flags |= RADEON_TILING_MICRO;
@@ -820,6 +854,12 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
     memset(&desc, 0, sizeof(desc));
     desc.base.alignment = alignment;
 
+    /* Align size to page size. This is the minimum alignment for normal
+     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
+     * like constant/uniform buffers, can benefit from better and more reuse.
+     */
+    size = align(size, 4096);
+
     /* Only set one usage bit each for domains and flags, or the cache manager
      * might consider different sets of domains / flags compatible
      */
@@ -1125,7 +1165,6 @@ void radeon_bomgr_init_functions(struct radeon_drm_winsys *ws)
     ws->base.buffer_map = radeon_bo_map;
     ws->base.buffer_unmap = radeon_bo_unmap;
     ws->base.buffer_wait = radeon_bo_wait;
-    ws->base.buffer_is_busy = radeon_bo_is_busy;
     ws->base.buffer_create = radeon_winsys_bo_create;
     ws->base.buffer_from_handle = radeon_winsys_bo_from_handle;
     ws->base.buffer_from_ptr = radeon_winsys_bo_from_ptr;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index b83ce168b4e..f8f50cc5d5b 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -54,6 +54,7 @@ struct radeon_bo {
 
     void *ptr;
     pipe_mutex map_mutex;
+    unsigned map_count;
 
     uint32_t handle;
     uint32_t flink_name;
@@ -71,7 +72,7 @@ struct radeon_bo {
 struct pb_manager *radeon_bomgr_create(struct radeon_drm_winsys *rws);
 void radeon_bomgr_init_functions(struct radeon_drm_winsys *ws);
 
-static INLINE
+static inline
 void radeon_bo_reference(struct radeon_bo **dst, struct radeon_bo *src)
 {
     pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index ecf89578c68..7a267f9acbf 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -80,22 +80,39 @@ radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
 static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                    struct pipe_fence_handle *src);
 
+static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
+{
+    /* This winsys has no real context object: the winsys pointer itself is
+     * returned as the opaque "context"; radeon_drm_cs_create casts it back. */
+    return (struct radeon_winsys_ctx*)ws;
+}
+
+static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
+{
+    /* Nothing to release: radeon_drm_ctx_create allocates nothing. */
+}
+
 static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
                                       struct radeon_drm_winsys *ws)
 {
     int i;
 
+    csc->buf = MALLOC(ws->ib_max_size);
+    if (!csc->buf)
+        return FALSE;
     csc->fd = ws->fd;
     csc->nrelocs = 512;
     csc->relocs_bo = (struct radeon_bo**)
                      CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
     if (!csc->relocs_bo) {
+        FREE(csc->buf);
         return FALSE;
     }
 
     csc->relocs = (struct drm_radeon_cs_reloc*)
                   CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
     if (!csc->relocs) {
+        FREE(csc->buf);
         FREE(csc->relocs_bo);
         return FALSE;
     }
@@ -148,18 +165,19 @@ static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
     radeon_cs_context_cleanup(csc);
     FREE(csc->relocs_bo);
     FREE(csc->relocs);
+    FREE(csc->buf);
 }
 
 
 static struct radeon_winsys_cs *
-radeon_drm_cs_create(struct radeon_winsys *rws,
+radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                      enum ring_type ring_type,
                      void (*flush)(void *ctx, unsigned flags,
                                    struct pipe_fence_handle **fence),
                      void *flush_ctx,
                      struct radeon_winsys_cs_handle *trace_buf)
 {
-    struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
+    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
     struct radeon_drm_cs *cs;
 
     cs = CALLOC_STRUCT(radeon_drm_cs);
@@ -188,6 +206,7 @@ radeon_drm_cs_create(struct radeon_winsys *rws,
     cs->cst = &cs->csc2;
     cs->base.buf = cs->csc->buf;
     cs->base.ring_type = ring_type;
+    cs->base.max_dw = ws->ib_max_size / 4;
 
     p_atomic_inc(&ws->num_cs);
     return &cs->base;
@@ -195,7 +214,7 @@ radeon_drm_cs_create(struct radeon_winsys *rws,
 
 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
 
-static INLINE void update_reloc(struct drm_radeon_cs_reloc *reloc,
+static inline void update_reloc(struct drm_radeon_cs_reloc *reloc,
                                 enum radeon_bo_domain rd,
                                 enum radeon_bo_domain wd,
                                 unsigned priority,
@@ -372,20 +391,29 @@ static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
 static boolean radeon_drm_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    boolean status =
-        (cs->csc->used_gart + gtt) < cs->ws->info.gart_size * 0.7 &&
-        (cs->csc->used_vram + vram) < cs->ws->info.vram_size * 0.7;
 
-    return status;
+    vram += cs->csc->used_vram;
+    gtt += cs->csc->used_gart;
+
+    /* VRAM demand in excess of vram_size is counted against GTT instead. */
+    if (vram > cs->ws->info.vram_size)
+        gtt += vram - cs->ws->info.vram_size;
+
+    /* Accept only if the total GTT demand is below 70% of gart_size. */
+    return gtt < cs->ws->info.gart_size * 0.7;
 }
 
 void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc)
 {
     unsigned i;
+    int r;
 
-    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
-                            &csc->cs, sizeof(struct drm_radeon_cs))) {
-        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
+    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
+                            &csc->cs, sizeof(struct drm_radeon_cs));
+    if (r) {
+	if (r == -ENOMEM)
+	    fprintf(stderr, "radeon: Not enough memory for command submission.\n");
+	else if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
             unsigned i;
 
             fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
@@ -467,7 +495,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
         break;
     }
 
-    if (rcs->cdw > RADEON_MAX_CMDBUF_DWORDS) {
+    if (rcs->cdw > rcs->max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
     }
 
@@ -486,7 +514,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
     cs->cst->cs_trace_id = cs_trace_id;
 
     /* If the CS is not empty or overflowed, emit it in a separate thread. */
-    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS && !debug_get_option_noop()) {
+    if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
         unsigned i, crelocs;
 
         crelocs = cs->cst->crelocs;
@@ -522,6 +550,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
 
         default:
         case RING_GFX:
+        case RING_COMPUTE:
             cs->cst->flags[0] = 0;
             cs->cst->flags[1] = RADEON_CS_RING_GFX;
             cs->cst->cs.num_chunks = 2;
@@ -537,7 +566,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                 cs->cst->cs.num_chunks = 3;
             }
-            if (flags & RADEON_FLUSH_COMPUTE) {
+            if (cs->base.ring_type == RING_COMPUTE) {
                 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                 cs->cst->cs.num_chunks = 3;
             }
@@ -625,7 +654,7 @@ static bool radeon_fence_wait(struct radeon_winsys *ws,
     struct pb_buffer *rfence = (struct pb_buffer*)fence;
 
     if (timeout == 0)
-        return !ws->buffer_is_busy(rfence, RADEON_USAGE_READWRITE);
+        return ws->buffer_wait(rfence, 0, RADEON_USAGE_READWRITE);
 
     if (timeout != PIPE_TIMEOUT_INFINITE) {
         int64_t start_time = os_time_get();
@@ -634,7 +663,7 @@ static bool radeon_fence_wait(struct radeon_winsys *ws,
         timeout /= 1000;
 
         /* Wait in a loop. */
-        while (ws->buffer_is_busy(rfence, RADEON_USAGE_READWRITE)) {
+        while (!ws->buffer_wait(rfence, 0, RADEON_USAGE_READWRITE)) {
             if (os_time_get() - start_time >= timeout) {
                 return FALSE;
             }
@@ -643,7 +672,7 @@ static bool radeon_fence_wait(struct radeon_winsys *ws,
         return TRUE;
     }
 
-    ws->buffer_wait(rfence, RADEON_USAGE_READWRITE);
+    ws->buffer_wait(rfence, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_READWRITE);
     return TRUE;
 }
 
@@ -655,6 +684,8 @@ static void radeon_fence_reference(struct pipe_fence_handle **dst,
 
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
 {
+    ws->base.ctx_create = radeon_drm_ctx_create;
+    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
     ws->base.cs_create = radeon_drm_cs_create;
     ws->base.cs_destroy = radeon_drm_cs_destroy;
     ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index fcc29fe9480..ab154945880 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -30,7 +30,7 @@
 #include "radeon_drm_bo.h"
 
 struct radeon_cs_context {
-    uint32_t                    buf[RADEON_MAX_CMDBUF_DWORDS];
+    uint32_t                    *buf;
 
     int                         fd;
     struct drm_radeon_cs        cs;
@@ -79,13 +79,13 @@ struct radeon_drm_cs {
 
 int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo);
 
-static INLINE struct radeon_drm_cs *
+static inline struct radeon_drm_cs *
 radeon_drm_cs(struct radeon_winsys_cs *base)
 {
     return (struct radeon_drm_cs*)base;
 }
 
-static INLINE boolean
+static inline boolean
 radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs,
                               struct radeon_bo *bo)
 {
@@ -94,7 +94,7 @@ radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs,
            (num_refs && radeon_get_reloc(cs->csc, bo) != -1);
 }
 
-static INLINE boolean
+static inline boolean
 radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
                                         struct radeon_bo *bo)
 {
@@ -110,7 +110,7 @@ radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
     return cs->csc->relocs[index].write_domain != 0;
 }
 
-static INLINE boolean
+static inline boolean
 radeon_bo_is_referenced_by_any_cs(struct radeon_bo *bo)
 {
     return bo->num_cs_references != 0;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index ba8d1437b6f..b70bbaa54a3 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -57,6 +57,12 @@
 #define RADEON_INFO_READ_REG		0x24
 #endif
 
+#define RADEON_INFO_VA_UNMAP_WORKING	0x25
+
+#ifndef RADEON_INFO_GPU_RESET_COUNTER
+#define RADEON_INFO_GPU_RESET_COUNTER   0x26
+#endif
+
 static struct util_hash_table *fd_tab = NULL;
 pipe_static_mutex(fd_tab_mutex);
 
@@ -389,16 +395,22 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
         }
 
         ws->info.r600_virtual_address = FALSE;
-        if (ws->info.drm_minor >= 13) {
-            uint32_t ib_vm_max_size;
+        ws->ib_max_size = 64 * 1024;
 
+        if (ws->info.drm_minor >= 13) {
             ws->info.r600_virtual_address = TRUE;
             if (!radeon_get_drm_value(ws->fd, RADEON_INFO_VA_START, NULL,
                                       &ws->va_start))
                 ws->info.r600_virtual_address = FALSE;
-            if (!radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
-                                      &ib_vm_max_size))
+
+            if (radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
+                                     &ws->ib_max_size))
+                ws->ib_max_size *= 4; /* the kernel returns the size in dwords */
+            else
                 ws->info.r600_virtual_address = FALSE;
+
+            radeon_get_drm_value(ws->fd, RADEON_INFO_VA_UNMAP_WORKING, NULL,
+                                 &ws->va_unmap_working);
         }
 	if (ws->gen == DRV_R600 && !debug_get_bool_option("RADEON_VA", FALSE))
 		ws->info.r600_virtual_address = FALSE;
@@ -484,6 +496,10 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
     if (ws->gen >= DRV_R600) {
         radeon_surface_manager_free(ws->surf_man);
     }
+
+    if (ws->fd >= 0)
+        close(ws->fd);
+
     FREE(rws);
 }
 
@@ -563,6 +579,10 @@ static uint64_t radeon_query_value(struct radeon_winsys *rws,
         radeon_get_drm_value(ws->fd, RADEON_INFO_CURRENT_GPU_MCLK,
                              "current-gpu-mclk", (uint32_t*)&retval);
         return retval;
+    case RADEON_GPU_RESET_COUNTER:
+        radeon_get_drm_value(ws->fd, RADEON_INFO_GPU_RESET_COUNTER,
+                             "gpu-reset-counter", (uint32_t*)&retval);
+        return retval;
     }
     return 0;
 }
@@ -696,7 +716,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
         return NULL;
     }
 
-    ws->fd = fd;
+    ws->fd = dup(fd);
 
     if (!do_winsys_init(ws))
         goto fail;
@@ -706,13 +726,13 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
     if (!ws->kman)
         goto fail;
 
-    ws->cman = pb_cache_manager_create(ws->kman, 1000000, 2.0f, 0,
+    ws->cman = pb_cache_manager_create(ws->kman, 500000, 2.0f, 0,
                                        MIN2(ws->info.vram_size, ws->info.gart_size));
     if (!ws->cman)
         goto fail;
 
     if (ws->gen >= DRV_R600) {
-        ws->surf_man = radeon_surface_manager_new(fd);
+        ws->surf_man = radeon_surface_manager_new(ws->fd);
         if (!ws->surf_man)
             goto fail;
     }
@@ -753,7 +773,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
         return NULL;
     }
 
-    util_hash_table_set(fd_tab, intptr_to_pointer(fd), ws);
+    util_hash_table_set(fd_tab, intptr_to_pointer(ws->fd), ws);
 
     /* We must unlock the mutex once the winsys is fully initialized, so that
      * other threads attempting to create the winsys from the same fd will
@@ -770,6 +790,9 @@ fail:
         ws->kman->destroy(ws->kman);
     if (ws->surf_man)
         radeon_surface_manager_free(ws->surf_man);
+    if (ws->fd >= 0)
+        close(ws->fd);
+
     FREE(ws);
     return NULL;
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 166b6b93d28..c1a8d6ae564 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -73,7 +73,9 @@ struct radeon_drm_winsys {
 
     enum radeon_generation gen;
     struct radeon_info info;
+    uint32_t ib_max_size;
     uint32_t va_start;
+    uint32_t va_unmap_working;
     uint32_t accel_working2;
 
     struct pb_manager *kman;
@@ -96,7 +98,7 @@ struct radeon_drm_winsys {
     struct radeon_drm_cs *cs_stack[RING_LAST];
 };
 
-static INLINE struct radeon_drm_winsys *
+static inline struct radeon_drm_winsys *
 radeon_drm_winsys(struct radeon_winsys *base)
 {
     return (struct radeon_drm_winsys*)base;
diff --git a/src/gallium/winsys/svga/drm/SConscript b/src/gallium/winsys/svga/drm/SConscript
index 099acdac8c0..25850531d31 100644
--- a/src/gallium/winsys/svga/drm/SConscript
+++ b/src/gallium/winsys/svga/drm/SConscript
@@ -8,7 +8,6 @@ if env['gcc'] or env['clang'] or env['icc']:
     env.Append(CCFLAGS = ['-fvisibility=hidden'])
     env.Append(CPPDEFINES = [
         'HAVE_STDINT_H', 
-        'HAVE_SYS_TYPES_H',
         '-D_FILE_OFFSET_BITS=64',
     ])
 
diff --git a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
index fceb0897058..5ef95f3d6a9 100644
--- a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
+++ b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
@@ -127,7 +127,7 @@ struct fenced_buffer
 };
 
 
-static INLINE struct fenced_manager *
+static inline struct fenced_manager *
 fenced_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -135,7 +135,7 @@ fenced_manager(struct pb_manager *mgr)
 }
 
 
-static INLINE struct fenced_buffer *
+static inline struct fenced_buffer *
 fenced_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -204,7 +204,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 }
 
 
-static INLINE void
+static inline void
 fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
                              struct fenced_buffer *fenced_buf)
 {
@@ -228,7 +228,7 @@ fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
  *
  * Reference count should be incremented before calling this function.
  */
-static INLINE void
+static inline void
 fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
                          struct fenced_buffer *fenced_buf)
 {
@@ -252,7 +252,7 @@ fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
  *
  * Returns TRUE if the buffer was detroyed.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -289,7 +289,7 @@ fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
  * This function will release and re-acquire the mutex, so any copy of mutable
  * state must be discarded after calling it.
  */
-static INLINE enum pipe_error
+static inline enum pipe_error
 fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
@@ -430,7 +430,7 @@ fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
  * This function is a shorthand around pb_manager::create_buffer for
  * fenced_buffer_create_gpu_storage_locked()'s benefit.
  */
-static INLINE boolean
+static inline boolean
 fenced_buffer_try_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
                                             struct fenced_buffer *fenced_buf,
                                             const struct pb_desc *desc)
diff --git a/src/gallium/winsys/svga/drm/vmw_buffer.c b/src/gallium/winsys/svga/drm/vmw_buffer.c
index c516054b7fc..7eab3d050e4 100644
--- a/src/gallium/winsys/svga/drm/vmw_buffer.c
+++ b/src/gallium/winsys/svga/drm/vmw_buffer.c
@@ -69,7 +69,7 @@ struct vmw_gmr_buffer
 extern const struct pb_vtbl vmw_gmr_buffer_vtbl;
 
 
-static INLINE struct vmw_gmr_buffer *
+static inline struct vmw_gmr_buffer *
 vmw_gmr_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -86,7 +86,7 @@ struct vmw_gmr_bufmgr
 };
 
 
-static INLINE struct vmw_gmr_bufmgr *
+static inline struct vmw_gmr_bufmgr *
 vmw_gmr_bufmgr(struct pb_manager *mgr)
 {
    assert(mgr);
diff --git a/src/gallium/winsys/svga/drm/vmw_buffer.h b/src/gallium/winsys/svga/drm/vmw_buffer.h
index e0bb8085a48..b9cbb25541f 100644
--- a/src/gallium/winsys/svga/drm/vmw_buffer.h
+++ b/src/gallium/winsys/svga/drm/vmw_buffer.h
@@ -59,7 +59,7 @@ struct debug_flush_buf *
 vmw_debug_flush_buf(struct svga_winsys_buffer *buffer);
 
 #else
-static INLINE struct pb_buffer *
+static inline struct pb_buffer *
 vmw_pb_buffer(struct svga_winsys_buffer *buffer)
 {
    assert(buffer);
@@ -67,7 +67,7 @@ vmw_pb_buffer(struct svga_winsys_buffer *buffer)
 }
 
 
-static INLINE struct svga_winsys_buffer *
+static inline struct svga_winsys_buffer *
 vmw_svga_winsys_buffer_wrap(struct pb_buffer *buffer)
 {
    return (struct svga_winsys_buffer *)buffer;
diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c
index 4e1c41db886..31bedde7c41 100644
--- a/src/gallium/winsys/svga/drm/vmw_context.c
+++ b/src/gallium/winsys/svga/drm/vmw_context.c
@@ -152,7 +152,7 @@ struct vmw_svga_winsys_context
 };
 
 
-static INLINE struct vmw_svga_winsys_context *
+static inline struct vmw_svga_winsys_context *
 vmw_svga_winsys_context(struct svga_winsys_context *swc)
 {
    assert(swc);
@@ -160,7 +160,7 @@ vmw_svga_winsys_context(struct svga_winsys_context *swc)
 }
 
 
-static INLINE unsigned
+static inline unsigned
 vmw_translate_to_pb_flags(unsigned flags)
 {
    unsigned f = 0;
diff --git a/src/gallium/winsys/svga/drm/vmw_fence.c b/src/gallium/winsys/svga/drm/vmw_fence.c
index 1b24239a7ce..17822ce27fd 100644
--- a/src/gallium/winsys/svga/drm/vmw_fence.c
+++ b/src/gallium/winsys/svga/drm/vmw_fence.c
@@ -67,7 +67,7 @@ struct vmw_fence
  * @ops: Pointer to a struct pb_fence_ops.
  *
  */
-static INLINE boolean
+static inline boolean
 vmw_fence_seq_is_signaled(uint32_t seq, uint32_t last, uint32_t cur)
 {
    return (cur - last <= cur - seq);
@@ -81,7 +81,7 @@ vmw_fence_seq_is_signaled(uint32_t seq, uint32_t last, uint32_t cur)
  * @ops: Pointer to a struct pb_fence_ops.
  *
  */
-static INLINE struct vmw_fence_ops *
+static inline struct vmw_fence_ops *
 vmw_fence_ops(struct pb_fence_ops *ops)
 {
    assert(ops);
@@ -162,7 +162,7 @@ out_unlock:
  *
  * @fence: The opaque pipe fence handle.
  */
-static INLINE struct vmw_fence *
+static inline struct vmw_fence *
 vmw_fence(struct pipe_fence_handle *fence)
 {
    return (struct vmw_fence *) fence;
diff --git a/src/gallium/winsys/svga/drm/vmw_screen.h b/src/gallium/winsys/svga/drm/vmw_screen.h
index fd76e614a5e..ce98db9b397 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen.h
+++ b/src/gallium/winsys/svga/drm/vmw_screen.h
@@ -102,7 +102,7 @@ struct vmw_winsys_screen
 };
 
 
-static INLINE struct vmw_winsys_screen *
+static inline struct vmw_winsys_screen *
 vmw_winsys_screen(struct svga_winsys_screen *base)
 {
    return (struct vmw_winsys_screen *)base;
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_dri.c b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
index 9f335900e68..e70e0fec4a3 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_dri.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
@@ -126,7 +126,7 @@ out_no_vws:
    return NULL;
 }
 
-static INLINE boolean
+static inline boolean
 vmw_dri1_intersect_src_bbox(struct drm_clip_rect *dst,
 			    int dst_x,
 			    int dst_y,
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
index 14c3b2068c6..e2f0da58bf9 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
@@ -650,7 +650,7 @@ vmw_ioctl_fence_unref(struct vmw_winsys_screen *vws,
       vmw_error("%s Failed\n", __FUNCTION__);
 }
 
-static INLINE uint32_t
+static inline uint32_t
 vmw_drm_fence_flags(uint32_t flags)
 {
     uint32_t dflags = 0;
diff --git a/src/gallium/winsys/svga/drm/vmw_shader.h b/src/gallium/winsys/svga/drm/vmw_shader.h
index 1fd8c3311f9..28f99717391 100644
--- a/src/gallium/winsys/svga/drm/vmw_shader.h
+++ b/src/gallium/winsys/svga/drm/vmw_shader.h
@@ -47,14 +47,14 @@ struct vmw_svga_winsys_shader
    uint32_t shid;
 };
 
-static INLINE struct svga_winsys_gb_shader *
+static inline struct svga_winsys_gb_shader *
 svga_winsys_shader(struct vmw_svga_winsys_shader *shader)
 {
    assert(!shader || shader->shid != SVGA3D_INVALID_ID);
    return (struct svga_winsys_gb_shader *)shader;
 }
 
-static INLINE struct vmw_svga_winsys_shader *
+static inline struct vmw_svga_winsys_shader *
 vmw_svga_winsys_shader(struct svga_winsys_gb_shader *shader)
 {
    return (struct vmw_svga_winsys_shader *)shader;
diff --git a/src/gallium/winsys/svga/drm/vmw_surface.h b/src/gallium/winsys/svga/drm/vmw_surface.h
index e44d0554fbc..1291f380aa2 100644
--- a/src/gallium/winsys/svga/drm/vmw_surface.h
+++ b/src/gallium/winsys/svga/drm/vmw_surface.h
@@ -68,7 +68,7 @@ struct vmw_svga_winsys_surface
 };
 
 
-static INLINE struct svga_winsys_surface *
+static inline struct svga_winsys_surface *
 svga_winsys_surface(struct vmw_svga_winsys_surface *surf)
 {
    assert(!surf || surf->sid != SVGA3D_INVALID_ID);
@@ -76,7 +76,7 @@ svga_winsys_surface(struct vmw_svga_winsys_surface *surf)
 }
 
 
-static INLINE struct vmw_svga_winsys_surface *
+static inline struct vmw_svga_winsys_surface *
 vmw_svga_winsys_surface(struct svga_winsys_surface *surf)
 {
    return (struct vmw_svga_winsys_surface *)surf;
diff --git a/src/gallium/winsys/sw/dri/dri_sw_winsys.c b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
index 6fed22bbd7c..8451d832806 100644
--- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
@@ -55,13 +55,13 @@ struct dri_sw_winsys
    struct drisw_loader_funcs *lf;
 };
 
-static INLINE struct dri_sw_displaytarget *
+static inline struct dri_sw_displaytarget *
 dri_sw_displaytarget( struct sw_displaytarget *dt )
 {
    return (struct dri_sw_displaytarget *)dt;
 }
 
-static INLINE struct dri_sw_winsys *
+static inline struct dri_sw_winsys *
 dri_sw_winsys( struct sw_winsys *ws )
 {
    return (struct dri_sw_winsys *)ws;
diff --git a/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c b/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
index aae3ec55a25..dc725f4b90c 100644
--- a/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
+++ b/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
@@ -62,7 +62,7 @@ struct gdi_sw_displaytarget
 
 
 /** Cast wrapper */
-static INLINE struct gdi_sw_displaytarget *
+static inline struct gdi_sw_displaytarget *
 gdi_sw_displaytarget( struct sw_displaytarget *buf )
 {
    return (struct gdi_sw_displaytarget *)buf;
diff --git a/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c b/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c
index a71d2a76791..89dd5471b09 100644
--- a/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c
+++ b/src/gallium/winsys/sw/hgl/hgl_sw_winsys.c
@@ -67,7 +67,7 @@ struct haiku_displaytarget
 
 
 // Cast
-static INLINE struct haiku_displaytarget*
+static inline struct haiku_displaytarget*
 hgl_sw_displaytarget(struct sw_displaytarget* target)
 {
 	return (struct haiku_displaytarget *)target;
diff --git a/src/gallium/winsys/sw/kms-dri/SConscript b/src/gallium/winsys/sw/kms-dri/SConscript
deleted file mode 100644
index e7dd721dd13..00000000000
--- a/src/gallium/winsys/sw/kms-dri/SConscript
+++ /dev/null
@@ -1,23 +0,0 @@
-#######################################################################
-# SConscript for kms-dri winsys
-
-
-Import('*')
-
-if env['platform'] not in ('linux'):
-    Return()
-
-env = env.Clone()
-
-env.PkgUseModules('DRM')
-
-env.Append(CPPPATH = [
-    '#/src/gallium/include',
-    '#/src/gallium/auxiliary',
-])
-
-ws_kms_dri = env.ConvenienceLibrary(
-    target = 'ws_kms_dri',
-    source = env.ParseSourceList('Makefile.sources', 'C_SOURCES'),
-)
-Export('ws_kms_dri')
diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
index 740b9201140..900c49f83e6 100644
--- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
@@ -83,13 +83,13 @@ struct kms_sw_winsys
    struct list_head bo_list;
 };
 
-static INLINE struct kms_sw_displaytarget *
+static inline struct kms_sw_displaytarget *
 kms_sw_displaytarget( struct sw_displaytarget *dt )
 {
    return (struct kms_sw_displaytarget *)dt;
 }
 
-static INLINE struct kms_sw_winsys *
+static inline struct kms_sw_winsys *
 kms_sw_winsys( struct sw_winsys *ws )
 {
    return (struct kms_sw_winsys *)ws;
diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
index a6bf4985e1e..9b90eaa018b 100644
--- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
+++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
@@ -66,13 +66,13 @@ struct wrapper_sw_displaytarget
    void *ptr;
 };
 
-static INLINE struct wrapper_sw_winsys *
+static inline struct wrapper_sw_winsys *
 wrapper_sw_winsys(struct sw_winsys *ws)
 {
    return (struct wrapper_sw_winsys *)ws;
 }
 
-static INLINE struct wrapper_sw_displaytarget *
+static inline struct wrapper_sw_displaytarget *
 wrapper_sw_displaytarget(struct sw_displaytarget *dt)
 {
    return (struct wrapper_sw_displaytarget *)dt;
diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
index 88310718049..515ecd9f7b7 100644
--- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
+++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
@@ -92,7 +92,7 @@ struct xlib_sw_winsys
 
 
 /** Cast wrapper */
-static INLINE struct xlib_displaytarget *
+static inline struct xlib_displaytarget *
 xlib_displaytarget(struct sw_displaytarget *dt)
 {
    return (struct xlib_displaytarget *) dt;
diff --git a/src/gbm/Makefile.am b/src/gbm/Makefile.am
index 918fdf7d6ad..9a584cab352 100644
--- a/src/gbm/Makefile.am
+++ b/src/gbm/Makefile.am
@@ -1,5 +1,3 @@
-AUTOMAKE_OPTIONS = subdir-objects
-
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = main/gbm.pc
 
@@ -41,18 +39,15 @@ libgbm_la_LIBADD += $(top_builddir)/src/egl/wayland/wayland-drm/libwayland-drm.l
 endif
 
 if HAVE_DRI2
-noinst_LTLIBRARIES = libgbm_dri.la
-libgbm_dri_la_SOURCES = \
+libgbm_la_SOURCES += \
 	backends/dri/gbm_dri.c \
 	backends/dri/gbm_driint.h
 
-libgbm_dri_la_CFLAGS = \
-	$(AM_CFLAGS) \
+AM_CFLAGS += \
 	-DDEFAULT_DRIVER_DIR='"$(DRI_DRIVER_SEARCH_DIR)"' \
 	$(LIBDRM_CFLAGS)
 
 libgbm_la_LIBADD += \
-	libgbm_dri.la \
 	$(LIBDRM_LIBS)
 endif
 
diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 74da9e5b979..2ab40506e97 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src \
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index a234ac6f8e2..b491ad4d36f 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -33,6 +33,7 @@ NIR_FILES = \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
 	nir/nir_lower_global_vars_to_local.c \
+	nir/nir_lower_load_const_to_scalar.c \
 	nir/nir_lower_locals_to_regs.c \
 	nir/nir_lower_idiv.c \
 	nir/nir_lower_io.c \
@@ -55,6 +56,7 @@ NIR_FILES = \
 	nir/nir_opt_peephole_ffma.c \
 	nir/nir_opt_peephole_select.c \
 	nir/nir_opt_remove_phis.c \
+	nir/nir_opt_undef.c \
 	nir/nir_print.c \
 	nir/nir_remove_dead_variables.c \
 	nir/nir_search.c \
@@ -157,6 +159,8 @@ LIBGLSL_FILES = \
 	lower_packed_varyings.cpp \
 	lower_named_interface_blocks.cpp \
 	lower_packing_builtins.cpp \
+	lower_subroutine.cpp \
+	lower_tess_level.cpp \
 	lower_texture_projection.cpp \
 	lower_variable_index_to_cond_assign.cpp \
 	lower_vec_index_to_cond_assign.cpp \
diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index c52e518334d..eb6d8461671 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -304,6 +304,16 @@ private:
     * Is this function call actually a constructor?
     */
    bool cons;
+   ir_rvalue *
+   handle_method(exec_list *instructions,
+                 struct _mesa_glsl_parse_state *state);
+};
+
+class ast_subroutine_list : public ast_node
+{
+public:
+   virtual void print(void) const;
+   exec_list declarations;
 };
 
 class ast_array_specifier : public ast_node {
@@ -434,7 +444,9 @@ struct ast_type_qualifier {
 	 unsigned out:1;
 	 unsigned centroid:1;
          unsigned sample:1;
+	 unsigned patch:1;
 	 unsigned uniform:1;
+	 unsigned buffer:1;
 	 unsigned smooth:1;
 	 unsigned flat:1;
 	 unsigned noperspective:1;
@@ -518,6 +530,22 @@ struct ast_type_qualifier {
          /** \name Vulkan qualifiers */
          unsigned vk_set:1;
 
+	 /** \name Layout qualifiers for GL_ARB_tessellation_shader */
+	 /** \{ */
+	 /* tess eval input layout */
+	 /* gs prim_type reused for primitive mode */
+	 unsigned vertex_spacing:1;
+	 unsigned ordering:1;
+	 unsigned point_mode:1;
+	 /* tess control output layout */
+	 unsigned vertices:1;
+	 /** \} */
+
+         /** \name Qualifiers for GL_ARB_shader_subroutine */
+	 /** \{ */
+         unsigned subroutine:1;  /**< Is this marked 'subroutine' */
+         unsigned subroutine_def:1; /**< Is this marked 'subroutine' with a list of types */
+	 /** \} */
       }
       /** \brief Set of flags, accessed by name. */
       q;
@@ -553,7 +581,10 @@ struct ast_type_qualifier {
    /** Stream in GLSL 1.50 geometry shaders. */
    unsigned stream;
 
-   /** Input or output primitive type in GLSL 1.50 geometry shaders */
+   /**
+    * Input or output primitive type in GLSL 1.50 geometry shaders
+    * and tessellation shaders.
+    */
    GLenum prim_type;
 
    /**
@@ -580,6 +611,18 @@ struct ast_type_qualifier {
     */
    int local_size[3];
 
+   /** Tessellation evaluation shader: vertex spacing (equal, fractional even/odd) */
+   GLenum vertex_spacing;
+
+   /** Tessellation evaluation shader: vertex ordering (CW or CCW) */
+   GLenum ordering;
+
+   /** Tessellation evaluation shader: point mode */
+   bool point_mode;
+
+   /** Tessellation control shader: number of output vertices */
+   int vertices;
+
    /**
     * Image format specified with an ARB_shader_image_load_store
     * layout qualifier.
@@ -640,11 +683,17 @@ struct ast_type_qualifier {
 			_mesa_glsl_parse_state *state,
 			ast_type_qualifier q);
 
+   bool merge_out_qualifier(YYLTYPE *loc,
+                           _mesa_glsl_parse_state *state,
+                           ast_type_qualifier q,
+                           ast_node* &node);
+
    bool merge_in_qualifier(YYLTYPE *loc,
                            _mesa_glsl_parse_state *state,
                            ast_type_qualifier q,
                            ast_node* &node);
 
+   ast_subroutine_list *subroutine_list;
 };
 
 class ast_declarator_list;
@@ -1039,6 +1088,27 @@ public:
 };
 
 
+/**
+ * AST node representing a declaration of the output layout for tessellation
+ * control shaders; it carries the vertex count given to its constructor.
+ */
+class ast_tcs_output_layout : public ast_node
+{
+public:
+   ast_tcs_output_layout(const struct YYLTYPE &locp, int vertices)
+      : vertices(vertices)
+   {
+      set_location(locp);
+   }
+
+   virtual ir_rvalue *hir(exec_list *instructions,
+                          struct _mesa_glsl_parse_state *state);
+
+private:
+   const int vertices; /**< Vertex count passed to the constructor. */
+};
+
+
 /**
  * AST node representing a declaration of the input layout for geometry
  * shaders.
diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 752d86f72fd..27e84d101ec 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -107,6 +107,33 @@ update_max_array_access(ir_rvalue *ir, int idx, YYLTYPE *loc,
 }
 
 
+/* Returns the implicit size of a tessellation input array, or 0 if none. */
+static int
+get_implicit_array_size(struct _mesa_glsl_parse_state *state,
+                        ir_rvalue *array)
+{
+   const ir_variable *const var = array->variable_referenced();
+
+   switch (state->stage) {
+   case MESA_SHADER_TESS_CTRL:
+      /* Every control shader input is implicitly sized to the
+       * maximum patch size.
+       */
+      if (var->data.mode == ir_var_shader_in)
+         return state->Const.MaxPatchVertices;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      /* So is every evaluation shader input that is not "patch". */
+      if (var->data.mode == ir_var_shader_in && !var->data.patch)
+         return state->Const.MaxPatchVertices;
+      break;
+   default:
+      break;
+   }
+   return 0;
+}
+
+
 ir_rvalue *
 _mesa_ast_array_index_to_hir(void *mem_ctx,
 			     struct _mesa_glsl_parse_state *state,
@@ -183,7 +210,25 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
          update_max_array_access(array, idx, &loc, state);
    } else if (const_index == NULL && array->type->is_array()) {
       if (array->type->is_unsized_array()) {
-	 _mesa_glsl_error(&loc, state, "unsized array index must be constant");
+         int implicit_size = get_implicit_array_size(state, array);
+         if (implicit_size) {
+            ir_variable *v = array->whole_variable_referenced();
+            if (v != NULL)
+               v->data.max_array_access = implicit_size - 1;
+         }
+         else if (state->stage == MESA_SHADER_TESS_CTRL &&
+                  array->variable_referenced()->data.mode == ir_var_shader_out &&
+                  !array->variable_referenced()->data.patch) {
+            /* Tessellation control shader output non-patch arrays are
+             * initially unsized. Despite that, they are allowed to be
+             * indexed with a non-constant expression (typically
+             * "gl_InvocationID"). The array size will be determined
+             * by the linker.
+             */
+         }
+         else {
+            _mesa_glsl_error(&loc, state, "unsized array index must be constant");
+         }
       } else if (array->type->fields.array->is_interface()
                  && array->variable_referenced()->data.mode == ir_var_uniform
                  && !state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {
@@ -226,24 +271,24 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
        * dynamically uniform expression is undefined.
        */
       if (array->type->without_array()->is_sampler()) {
-	 if (!state->is_version(130, 100)) {
-	    if (state->es_shader) {
-	       _mesa_glsl_warning(&loc, state,
-				  "sampler arrays indexed with non-constant "
-				  "expressions is optional in %s",
-				  state->get_version_string());
-	    } else {
-	       _mesa_glsl_warning(&loc, state,
-				  "sampler arrays indexed with non-constant "
-				  "expressions will be forbidden in GLSL 1.30 "
-				  "and later");
-	    }
-	 } else if (!state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {
-	    _mesa_glsl_error(&loc, state,
-			     "sampler arrays indexed with non-constant "
-			     "expressions is forbidden in GLSL 1.30 and "
-			     "later");
-	 }
+         if (!state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {
+            if (state->is_version(130, 300))
+               _mesa_glsl_error(&loc, state,
+                                "sampler arrays indexed with non-constant "
+                                "expressions are forbidden in GLSL %s "
+                                "and later",
+                                state->es_shader ? "ES 3.00" : "1.30");
+            else if (state->es_shader)
+               _mesa_glsl_warning(&loc, state,
+                                  "sampler arrays indexed with non-constant "
+                                  "expressions will be forbidden in GLSL "
+                                  "3.00 and later");
+            else
+               _mesa_glsl_warning(&loc, state,
+                                  "sampler arrays indexed with non-constant "
+                                  "expressions will be forbidden in GLSL "
+                                  "1.30 and later");
+         }
       }
    }
 
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index 92e26bf2416..803edf5a14d 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -26,6 +26,7 @@
 #include "glsl_types.h"
 #include "ir.h"
 #include "main/core.h" /* for MIN2 */
+#include "main/shaderobj.h"
 
 static ir_rvalue *
 convert_component(ir_rvalue *src, const glsl_type *desired_type);
@@ -355,6 +356,8 @@ fix_parameter(void *mem_ctx, ir_rvalue *actual, const glsl_type *formal_type,
 static ir_rvalue *
 generate_call(exec_list *instructions, ir_function_signature *sig,
 	      exec_list *actual_parameters,
+              ir_variable *sub_var,
+	      ir_rvalue *array_idx,
 	      struct _mesa_glsl_parse_state *state)
 {
    void *ctx = state;
@@ -421,7 +424,8 @@ generate_call(exec_list *instructions, ir_function_signature *sig,
 
       deref = new(ctx) ir_dereference_variable(var);
    }
-   ir_call *call = new(ctx) ir_call(sig, deref, actual_parameters);
+
+   ir_call *call = new(ctx) ir_call(sig, deref, actual_parameters, sub_var, array_idx);
    instructions->push_tail(call);
 
    /* Also emit any necessary out-parameter conversions. */
@@ -489,6 +493,40 @@ done:
    return sig;
 }
 
+/* Resolve NAME as a call through a subroutine uniform of the current stage. */
+static ir_function_signature *
+match_subroutine_by_name(const char *name,
+                         exec_list *actual_parameters,
+                         struct _mesa_glsl_parse_state *state,
+                         ir_variable **var_r)
+{
+   /* Subroutine uniforms are looked up under a stage-prefixed name. */
+   const char *const prefix =
+      _mesa_shader_stage_to_subroutine_prefix(state->stage);
+   const char *const mangled = ralloc_asprintf(state, "%s_%s", prefix, name);
+
+   ir_variable *const uniform = state->symbols->get_variable(mangled);
+   if (uniform == NULL)
+      return NULL;
+
+   /* Find the subroutine type whose name matches the uniform's base type. */
+   const char *const type_name = uniform->type->without_array()->name;
+   ir_function *type_fn = NULL;
+   for (int i = 0; type_fn == NULL && i < state->num_subroutine_types; i++) {
+      ir_function *const candidate = state->subroutine_types[i];
+      if (strcmp(candidate->name, type_name) == 0)
+         type_fn = candidate;
+   }
+   if (type_fn == NULL)
+      return NULL;
+
+   /* The uniform is reported even when no signature accepts the arguments. */
+   *var_r = uniform;
+   bool exact_match = false;
+   return type_fn->matching_signature(state, actual_parameters,
+                                      false, &exact_match);
+}
+
 static void
 print_function_prototypes(_mesa_glsl_parse_state *state, YYLTYPE *loc,
                           ir_function *f)
@@ -1531,6 +1569,65 @@ process_record_constructor(exec_list *instructions,
                                              &actual_parameters, state);
 }
 
+/**
+ * Handle "method calls" in GLSL 1.20 - namely, array.length().  Returns the
+ * length as a constant, or an error value after reporting a compile error.
+ */
+ir_rvalue *
+ast_function_expression::handle_method(exec_list *instructions,
+                                       struct _mesa_glsl_parse_state *state)
+{
+   const ast_expression *field = subexpressions[0];
+   ir_rvalue *result;
+   void *ctx = state;
+   YYLTYPE loc = get_location();
+   state->check_version(120, 300, &loc, "methods not supported");
+
+   const char *const method = field->primary_expression.identifier;
+   ir_rvalue *const op = field->subexpressions[0]->hir(instructions, state);
+   if (strcmp(method, "length") == 0) {
+      if (!this->expressions.is_empty()) {
+         _mesa_glsl_error(&loc, state, "length method takes no arguments");
+         goto fail;
+      }
+
+      if (op->type->is_array()) {
+         if (op->type->is_unsized_array()) {
+            _mesa_glsl_error(&loc, state, "length called on unsized array");
+            goto fail;
+         }
+
+         result = new(ctx) ir_constant(op->type->array_size());
+      } else if (op->type->is_vector()) {
+         if (state->ARB_shading_language_420pack_enable) {
+            /* .length() returns int. */
+            result = new(ctx) ir_constant((int) op->type->vector_elements);
+         } else {
+            _mesa_glsl_error(&loc, state, "length method on vector only "
+                             "available with ARB_shading_language_420pack");
+            goto fail;
+         }
+      } else if (op->type->is_matrix()) {
+         if (state->ARB_shading_language_420pack_enable) {
+            /* .length() returns int. */
+            result = new(ctx) ir_constant((int) op->type->matrix_columns);
+         } else {
+            _mesa_glsl_error(&loc, state, "length method on matrix only "
+                             "available with ARB_shading_language_420pack");
+            goto fail;
+         }
+      } else {
+         _mesa_glsl_error(&loc, state, "length called on scalar.");
+         goto fail;
+      }
+   } else {
+      _mesa_glsl_error(&loc, state, "unknown method: `%s'", method);
+      goto fail;
+   }
+   return result;
+fail:
+   return ir_rvalue::error_value(ctx);
+}
 
 ir_rvalue *
 ast_function_expression::hir(exec_list *instructions,
@@ -1543,8 +1640,6 @@ ast_function_expression::hir(exec_list *instructions,
     * 2. methods - Only the .length() method of array types.
     * 3. functions - Calls to regular old functions.
     *
-    * Method calls are actually detected when the ast_field_selection
-    * expression is handled.
     */
    if (is_constructor()) {
       const ast_type_specifier *type = (ast_type_specifier *) subexpressions[0];
@@ -1765,11 +1860,22 @@ ast_function_expression::hir(exec_list *instructions,
 					       &actual_parameters,
 					       ctx);
       }
+   } else if (subexpressions[0]->oper == ast_field_selection) {
+      return handle_method(instructions, state);
    } else {
       const ast_expression *id = subexpressions[0];
-      const char *func_name = id->primary_expression.identifier;
+      const char *func_name;
       YYLTYPE loc = get_location();
       exec_list actual_parameters;
+      ir_variable *sub_var = NULL;
+      ir_rvalue *array_idx = NULL;
+
+      if (id->oper == ast_array_index) {
+         func_name = id->subexpressions[0]->primary_expression.identifier;
+	 array_idx = id->subexpressions[1]->hir(instructions, state);
+      } else {
+         func_name = id->primary_expression.identifier;
+      }
 
       process_parameters(instructions, &actual_parameters, &this->expressions,
 			 state);
@@ -1778,6 +1884,10 @@ ast_function_expression::hir(exec_list *instructions,
 	 match_function_by_name(func_name, &actual_parameters, state);
 
       ir_rvalue *value = NULL;
+      if (sig == NULL) {
+         sig = match_subroutine_by_name(func_name, &actual_parameters, state, &sub_var);
+      }
+
       if (sig == NULL) {
 	 no_matching_function_error(func_name, &loc, &actual_parameters, state);
 	 value = ir_rvalue::error_value(ctx);
@@ -1785,7 +1895,14 @@ ast_function_expression::hir(exec_list *instructions,
 	 /* an error has already been emitted */
 	 value = ir_rvalue::error_value(ctx);
       } else {
-	 value = generate_call(instructions, sig, &actual_parameters, state);
+         value = generate_call(instructions, sig, &actual_parameters, sub_var, array_idx, state);
+         if (!value) {
+            ir_variable *const tmp = new(ctx) ir_variable(glsl_type::void_type,
+                                                          "void_var",
+                                                          ir_var_temporary);
+            instructions->push_tail(tmp);
+            value = new(ctx) ir_dereference_variable(tmp);
+         }
       }
 
       return value;
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 6896b700cd6..fa2c09d2697 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -54,6 +54,7 @@
 #include "ast.h"
 #include "glsl_types.h"
 #include "program/hash_table.h"
+#include "main/shaderobj.h"
 #include "ir.h"
 #include "ir_builder.h"
 
@@ -79,6 +80,7 @@ _mesa_ast_to_hir(exec_list *instructions, struct _mesa_glsl_parse_state *state)
    state->toplevel_ir = instructions;
 
    state->gs_input_prim_type_specified = false;
+   state->tcs_output_vertices_specified = false;
    state->cs_input_local_size_specified = false;
 
    /* Section 4.2 of the GLSL 1.20 specification states:
@@ -638,6 +640,34 @@ shift_result_type(const struct glsl_type *type_a,
    return type_a;
 }
 
+/**
+ * Walks down an rvalue tree through array, record and swizzle nodes and
+ * returns the index expression of the last array dereference visited,
+ * i.e. the one closest to the underlying variable.  For an array of blocks
+ * that is the block index, not the index into an array-typed block member.
+ * Returns NULL when the tree contains no array dereference.
+ */
+static ir_rvalue *
+find_innermost_array_index(ir_rvalue *rv)
+{
+   ir_dereference_array *innermost = NULL;
+
+   for (ir_rvalue *node = rv; node != NULL; ) {
+      if (ir_dereference_array *a = node->as_dereference_array()) {
+         innermost = a;
+         node = a->array;
+      } else if (ir_dereference_record *r = node->as_dereference_record()) {
+         node = r->record;
+      } else if (ir_swizzle *s = node->as_swizzle()) {
+         node = s->val;
+      } else {
+         node = NULL;
+      }
+   }
+
+   return innermost != NULL ? innermost->array_index : NULL;
+}
+
 /**
  * Validates that a value can be assigned to a location with a specified type
  *
@@ -654,9 +684,9 @@ shift_result_type(const struct glsl_type *type_a,
  * In addition to being used for assignments, this function is used to
  * type-check return values.
  */
-ir_rvalue *
+static ir_rvalue *
 validate_assignment(struct _mesa_glsl_parse_state *state,
-                    YYLTYPE loc, const glsl_type *lhs_type,
+                    YYLTYPE loc, ir_rvalue *lhs,
                     ir_rvalue *rhs, bool is_initializer)
 {
    /* If there is already some error in the RHS, just return it.  Anything
@@ -665,9 +695,28 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
    if (rhs->type->is_error())
       return rhs;
 
+   /* In the Tessellation Control Shader:
+    * If a per-vertex output variable is used as an l-value, it is an error
+    * if the expression indicating the vertex number is not the identifier
+    * `gl_InvocationID`.  An LHS that references no variable is not checked.
+    */
+   if (state->stage == MESA_SHADER_TESS_CTRL) {
+      ir_variable *var = lhs->variable_referenced();
+      if (var && var->data.mode == ir_var_shader_out && !var->data.patch) {
+         ir_rvalue *index = find_innermost_array_index(lhs);
+         ir_variable *index_var = index ? index->variable_referenced() : NULL;
+         if (!index_var || strcmp(index_var->name, "gl_InvocationID") != 0) {
+            _mesa_glsl_error(&loc, state,
+                             "Tessellation control shader outputs can only "
+                             "be indexed by gl_InvocationID");
+            return NULL;
+         }
+      }
+   }
+
    /* If the types are identical, the assignment can trivially proceed.
     */
-   if (rhs->type == lhs_type)
+   if (rhs->type == lhs->type)
       return rhs;
 
    /* If the array element types are the same and the LHS is unsized,
@@ -677,8 +726,8 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
     * Note: Whole-array assignments are not permitted in GLSL 1.10, but this
     * is handled by ir_dereference::is_lvalue.
     */
-   if (lhs_type->is_unsized_array() && rhs->type->is_array()
-       && (lhs_type->fields.array == rhs->type->fields.array)) {
+   if (lhs->type->is_unsized_array() && rhs->type->is_array()
+       && (lhs->type->fields.array == rhs->type->fields.array)) {
       if (is_initializer) {
          return rhs;
       } else {
@@ -689,8 +738,8 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
    }
 
    /* Check for implicit conversion in GLSL 1.20 */
-   if (apply_implicit_conversion(lhs_type, rhs, state)) {
-      if (rhs->type == lhs_type)
+   if (apply_implicit_conversion(lhs->type, rhs, state)) {
+      if (rhs->type == lhs->type)
 	 return rhs;
    }
 
@@ -698,7 +747,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
                     "%s of type %s cannot be assigned to "
                     "variable of type %s",
                     is_initializer ? "initializer" : "value",
-                    rhs->type->name, lhs_type->name);
+                    rhs->type->name, lhs->type->name);
 
    return NULL;
 }
@@ -733,7 +782,7 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state,
 
       if (unlikely(lhs_expr->operation == ir_binop_vector_extract)) {
          ir_rvalue *new_rhs =
-            validate_assignment(state, lhs_loc, lhs->type,
+            validate_assignment(state, lhs_loc, lhs,
                                 rhs, is_initializer);
 
          if (new_rhs == NULL) {
@@ -795,7 +844,7 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state,
    }
 
    ir_rvalue *new_rhs =
-      validate_assignment(state, lhs_loc, lhs->type, rhs, is_initializer);
+      validate_assignment(state, lhs_loc, lhs, rhs, is_initializer);
    if (new_rhs != NULL) {
       rhs = new_rhs;
 
@@ -972,6 +1021,7 @@ do_comparison(void *mem_ctx, int operation, ir_rvalue *op0, ir_rvalue *op1)
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_ATOMIC_UINT:
+   case GLSL_TYPE_SUBROUTINE:
       /* I assume a comparison of a struct containing a sampler just
        * ignores the sampler present in the type.
        */
@@ -1271,7 +1321,14 @@ ast_expression::do_hir(exec_list *instructions,
        *    applied to one operand that can make them match, in which
        *    case this conversion is done."
        */
-      if ((!apply_implicit_conversion(op[0]->type, op[1], state)
+
+      if (op[0]->type == glsl_type::void_type || op[1]->type == glsl_type::void_type) {
+         _mesa_glsl_error(& loc, state, "`%s':  wrong operand types: no "
+                         "operation `%s' exists that takes a left-hand operand "
+                         "of type 'void' or a right operand of type 'void'",
+                         (this->oper == ast_equal) ? "==" : "!=", (this->oper == ast_equal) ? "==" : "!=");
+         error_emitted = true;
+      } else if ((!apply_implicit_conversion(op[0]->type, op[1], state)
            && !apply_implicit_conversion(op[1]->type, op[0], state))
           || (op[0]->type != op[1]->type)) {
          _mesa_glsl_error(& loc, state, "operands of `%s' must have the same "
@@ -2008,7 +2065,7 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
                                 const glsl_type *type,
                                 ir_variable *var)
 {
-   if (var && !var->is_in_uniform_block()) {
+   if (var && !var->is_in_buffer_block()) {
       /* Layout qualifiers may only apply to interface blocks and fields in
        * them.
        */
@@ -2045,9 +2102,10 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
                            ir_variable *var,
                            const ast_type_qualifier *qual)
 {
-   if (var->data.mode != ir_var_uniform) {
+   if (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage) {
       _mesa_glsl_error(loc, state,
-                       "the \"binding\" qualifier only applies to uniforms");
+                       "the \"binding\" qualifier only applies to uniforms and "
+                       "shader storage buffer objects");
       return false;
    }
 
@@ -2071,13 +2129,31 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
        *
        * The implementation-dependent maximum is GL_MAX_UNIFORM_BUFFER_BINDINGS.
        */
-      if (max_index >= ctx->Const.MaxUniformBufferBindings) {
+      if (var->data.mode == ir_var_uniform &&
+         max_index >= ctx->Const.MaxUniformBufferBindings) {
          _mesa_glsl_error(loc, state, "layout(binding = %d) for %d UBOs exceeds "
                           "the maximum number of UBO binding points (%d)",
                           qual->binding, elements,
                           ctx->Const.MaxUniformBufferBindings);
          return false;
       }
+      /* SSBOs. From page 67 of the GLSL 4.30 specification:
+       * "If the binding point for any uniform or shader storage block instance
+       *  is less than zero, or greater than or equal to the
+       *  implementation-dependent maximum number of uniform buffer bindings, a
+       *  compile-time error will occur. When the binding identifier is used
+       *  with a uniform or shader storage block instanced as an array of size
+       *  N, all elements of the array from binding through binding + N – 1 must
+       *  be within this range."
+       */
+      if (var->data.mode == ir_var_shader_storage &&
+         max_index >= ctx->Const.MaxShaderStorageBufferBindings) {
+         _mesa_glsl_error(loc, state, "layout(binding = %d) for %d SSBOs exceeds "
+                          "the maximum number of SSBO binding points (%d)",
+                          qual->binding, elements,
+                          ctx->Const.MaxShaderStorageBufferBindings);
+         return false;
+      }
    } else if (var->type->is_sampler() ||
               (var->type->is_array() && var->type->fields.array->is_sampler())) {
       /* Samplers.  From page 63 of the GLSL 4.20 specification:
@@ -2206,6 +2282,8 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
     *                     input            output
     *                     -----            ------
     * vertex              explicit_loc     sso
+    * tess control        sso              sso
+    * tess eval           sso              sso
     * geometry            sso              sso
     * fragment            sso              explicit_loc
     */
@@ -2228,6 +2306,8 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
       fail = true;
       break;
 
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       if (var->data.mode == ir_var_shader_in || var->data.mode == ir_var_shader_out) {
          if (!state->check_separate_shader_objects_allowed(loc, var))
@@ -2287,8 +2367,13 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
                : (qual->location + VARYING_SLOT_VAR0);
             break;
 
+         case MESA_SHADER_TESS_CTRL:
+         case MESA_SHADER_TESS_EVAL:
          case MESA_SHADER_GEOMETRY:
-            var->data.location = qual->location + VARYING_SLOT_VAR0;
+            if (var->data.patch)
+               var->data.location = qual->location + VARYING_SLOT_PATCH0;
+            else
+               var->data.location = qual->location + VARYING_SLOT_VAR0;
             break;
 
          case MESA_SHADER_FRAGMENT:
@@ -2439,6 +2524,12 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       }
    }
 
+   if (qual->flags.q.subroutine && !qual->flags.q.uniform) {
+      _mesa_glsl_error(loc, state,
+                       "`subroutine' may only be applied to uniforms, "
+                       "subroutine type declarations, or function definitions");
+   }
+
    if (qual->flags.q.constant || qual->flags.q.attribute
        || qual->flags.q.uniform
        || (qual->flags.q.varying && (state->stage == MESA_SHADER_FRAGMENT)))
@@ -2455,6 +2546,9 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       var->data.stream = qual->stream;
    }
 
+   if (qual->flags.q.patch)
+      var->data.patch = 1;
+
    if (qual->flags.q.attribute && state->stage != MESA_SHADER_VERTEX) {
       var->type = glsl_type::error_type;
       _mesa_glsl_error(loc, state,
@@ -2502,6 +2596,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       var->data.mode = ir_var_shader_out;
    else if (qual->flags.q.uniform)
       var->data.mode = ir_var_uniform;
+   else if (qual->flags.q.buffer)
+      var->data.mode = ir_var_shader_storage;
 
    if (!is_parameter && is_varying_var(var, state->stage)) {
       /* User-defined ins/outs are not permitted in compute shaders. */
@@ -2565,7 +2661,9 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       case MESA_SHADER_VERTEX:
          if (var->data.mode == ir_var_shader_out)
             var->data.invariant = true;
-	      break;
+         break;
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
       case MESA_SHADER_GEOMETRY:
          if ((var->data.mode == ir_var_shader_in)
              || (var->data.mode == ir_var_shader_out))
@@ -2984,6 +3082,15 @@ process_initializer(ir_variable *var, ast_declaration *decl,
                            "cannot initialize uniforms");
    }
 
+   /* Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec:
+    *
+    *    "Buffer variables cannot have initializers."
+    */
+   if (var->data.mode == ir_var_shader_storage) {
+      _mesa_glsl_error(& initializer_loc, state,
+                       "SSBO variables cannot have initializers");
+   }
+
    /* From section 4.1.7 of the GLSL 4.40 spec:
     *
     *    "Opaque variables [...] are initialized only through the
@@ -3019,7 +3126,7 @@ process_initializer(ir_variable *var, ast_declaration *decl,
    if (type->qualifier.flags.q.constant
        || type->qualifier.flags.q.uniform) {
       ir_rvalue *new_rhs = validate_assignment(state, initializer_loc,
-                                               var->type, rhs, true);
+                                               lhs, rhs, true);
       if (new_rhs != NULL) {
          rhs = new_rhs;
 
@@ -3105,30 +3212,13 @@ process_initializer(ir_variable *var, ast_declaration *decl,
    return result;
 }
 
-
-/**
- * Do additional processing necessary for geometry shader input declarations
- * (this covers both interface blocks arrays and bare input variables).
- */
 static void
-handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
-                                  YYLTYPE loc, ir_variable *var)
+validate_layout_qualifier_vertex_count(struct _mesa_glsl_parse_state *state,
+                                       YYLTYPE loc, ir_variable *var,
+                                       unsigned num_vertices,
+                                       unsigned *size,
+                                       const char *var_category)
 {
-   unsigned num_vertices = 0;
-   if (state->gs_input_prim_type_specified) {
-      num_vertices = vertices_per_prim(state->in_qualifier->prim_type);
-   }
-
-   /* Geometry shader input variables must be arrays.  Caller should have
-    * reported an error for this.
-    */
-   if (!var->type->is_array()) {
-      assert(state->error);
-
-      /* To avoid cascading failures, short circuit the checks below. */
-      return;
-   }
-
    if (var->type->is_unsized_array()) {
       /* Section 4.3.8.1 (Input Layout Qualifiers) of the GLSL 1.50 spec says:
        *
@@ -3138,6 +3228,8 @@ handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
        *
        * Followed by a table mapping each allowed input layout qualifier to
        * the corresponding input length.
+       *
+       * Similarly for tessellation control shader outputs.
        */
       if (num_vertices != 0)
          var->type = glsl_type::get_array_instance(var->type->fields.array,
@@ -3164,22 +3256,101 @@ handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
        */
       if (num_vertices != 0 && var->type->length != num_vertices) {
          _mesa_glsl_error(&loc, state,
-                          "geometry shader input size contradicts previously"
-                          " declared layout (size is %u, but layout requires a"
-                          " size of %u)", var->type->length, num_vertices);
-      } else if (state->gs_input_size != 0 &&
-                 var->type->length != state->gs_input_size) {
+                          "%s size contradicts previously declared layout "
+                          "(size is %u, but layout requires a size of %u)",
+                          var_category, var->type->length, num_vertices);
+      } else if (*size != 0 && var->type->length != *size) {
          _mesa_glsl_error(&loc, state,
-                          "geometry shader input sizes are "
-                          "inconsistent (size is %u, but a previous "
-                          "declaration has size %u)",
-                          var->type->length, state->gs_input_size);
+                          "%s sizes are inconsistent (size is %u, but a "
+                          "previous declaration has size %u)",
+                          var_category, var->type->length, *size);
       } else {
-         state->gs_input_size = var->type->length;
+         *size = var->type->length;
       }
    }
 }
 
+/**
+ * Do additional processing necessary for tessellation control shader output
+ * declarations: non-"patch" outputs must be arrays of a consistent size.
+ */
+static void
+handle_tess_ctrl_shader_output_decl(struct _mesa_glsl_parse_state *state,
+                                    YYLTYPE loc, ir_variable *var)
+{
+   const unsigned num_vertices = state->tcs_output_vertices_specified
+      ? state->out_qualifier->vertices : 0;
+
+   if (!var->type->is_array() && !var->data.patch) {
+      _mesa_glsl_error(&loc, state,
+                       "tessellation control shader outputs must be arrays");
+      /* To avoid cascading failures, short circuit the checks below. */
+      return;
+   }
+
+   if (var->data.patch)
+      return;
+
+   validate_layout_qualifier_vertex_count(state, loc, var, num_vertices,
+                                          &state->tcs_output_size,
+                                          "tessellation control shader output");
+}
+
+/**
+ * Extra processing for tessellation control/evaluation shader input
+ * declarations (interface block arrays and bare input variables alike):
+ * per-vertex inputs must be arrays, and unsized ones get an implicit size.
+ */
+static void
+handle_tess_shader_input_decl(struct _mesa_glsl_parse_state *state,
+                              YYLTYPE loc, ir_variable *var)
+{
+   /* Inputs qualified with "patch" need none of the checks below. */
+   if (var->data.patch)
+      return;
+
+   if (!var->type->is_array()) {
+      _mesa_glsl_error(&loc, state,
+                       "per-vertex tessellation shader inputs must be arrays");
+      return;
+   }
+
+   /* Unsized arrays are implicitly sized to gl_MaxPatchVertices. */
+   if (var->type->is_unsized_array()) {
+      var->type = glsl_type::get_array_instance(var->type->fields.array,
+                                                state->Const.MaxPatchVertices);
+   }
+}
+
+
+/**
+ * Extra processing for geometry shader input declarations, whether they are
+ * interface block arrays or bare input variables: the array size is checked
+ * against the input primitive type, if one has been specified.
+ */
+static void
+handle_geometry_shader_input_decl(struct _mesa_glsl_parse_state *state,
+                                  YYLTYPE loc, ir_variable *var)
+{
+   /* A vertex count of zero means no primitive type was specified yet. */
+   unsigned vertex_count = 0;
+   if (state->gs_input_prim_type_specified)
+      vertex_count = vertices_per_prim(state->in_qualifier->prim_type);
+
+   if (!var->type->is_array()) {
+      /* Geometry shader input variables must be arrays.  The caller should
+       * have reported an error for this already; stop here so that it does
+       * not cascade into further failures.
+       */
+      assert(state->error);
+      return;
+   }
+
+   validate_layout_qualifier_vertex_count(state, loc, var, vertex_count,
+                                          &state->gs_input_size,
+                                          "geometry shader input");
+}
 
 void
 validate_identifier(const char *identifier, YYLTYPE loc,
@@ -3358,6 +3529,18 @@ ast_declarator_list::hir(exec_list *instructions,
 
    decl_type = this->type->glsl_type(& type_name, state);
 
+   /* Section 4.3.7 "Buffer Variables" of the GLSL 4.30 spec:
+    *    "Buffer variables may only be declared inside interface blocks
+    *    (section 4.3.9 “Interface Blocks”), which are then referred to as
+    *    shader storage blocks. It is a compile-time error to declare buffer
+    *    variables at global scope (outside a block)."
+    */
+   if (type->qualifier.flags.q.buffer && !decl_type->is_interface()) {
+      _mesa_glsl_error(&loc, state,
+                       "buffer variables cannot be declared outside "
+                       "interface blocks");
+   }
+
    /* An offset-qualified atomic counter declaration sets the default
     * offset for the next declaration within the same atomic counter
     * buffer.
@@ -3431,7 +3614,7 @@ ast_declarator_list::hir(exec_list *instructions,
    foreach_list_typed (ast_declaration, decl, link, &this->declarations) {
       const struct glsl_type *var_type;
       ir_variable *var;
-
+      const char *identifier = decl->identifier;
       /* FINISHME: Emit a warning if a variable declaration shadows a
        * FINISHME: declaration at a higher scope.
        */
@@ -3449,10 +3632,24 @@ ast_declarator_list::hir(exec_list *instructions,
          continue;
       }
 
+      if (this->type->qualifier.flags.q.subroutine) {
+         const glsl_type *t;
+         const char *name;
+
+         t = state->symbols->get_type(this->type->specifier->type_name);
+         if (!t)
+            _mesa_glsl_error(& loc, state,
+                             "invalid type in declaration of `%s'",
+                             decl->identifier);
+         name = ralloc_asprintf(ctx, "%s_%s", _mesa_shader_stage_to_subroutine_prefix(state->stage), decl->identifier);
+
+         identifier = name;
+
+      }
       var_type = process_array_type(&loc, decl_type, decl->array_specifier,
                                     state);
 
-      var = new(ctx) ir_variable(var_type, decl->identifier, ir_var_auto);
+      var = new(ctx) ir_variable(var_type, identifier, ir_var_auto);
 
       /* The 'varying in' and 'varying out' qualifiers can only be used with
        * ARB_geometry_shader4 and EXT_geometry_shader4, which we don't support
@@ -3524,6 +3721,8 @@ ast_declarator_list::hir(exec_list *instructions,
           */
          if (this->type->qualifier.flags.q.attribute) {
             mode = "attribute";
+         } else if (this->type->qualifier.flags.q.subroutine) {
+            mode = "subroutine uniform";
          } else if (this->type->qualifier.flags.q.uniform) {
             mode = "uniform";
          } else if (this->type->qualifier.flags.q.varying) {
@@ -3662,6 +3861,9 @@ ast_declarator_list::hir(exec_list *instructions,
                   }
                }
             }
+         } else if (state->stage == MESA_SHADER_TESS_CTRL ||
+                    state->stage == MESA_SHADER_TESS_EVAL) {
+            handle_tess_shader_input_decl(state, loc, var);
          }
       } else if (var->data.mode == ir_var_shader_out) {
          const glsl_type *check_type = var->type->without_array();
@@ -3757,6 +3959,13 @@ ast_declarator_list::hir(exec_list *instructions,
                }
             }
          }
+
+         if (state->stage == MESA_SHADER_TESS_CTRL) {
+            handle_tess_ctrl_shader_output_decl(state, loc, var);
+         }
+      } else if (var->type->contains_subroutine()) {
+         /* declare subroutine uniforms as hidden */
+         var->data.how_declared = ir_var_hidden;
       }
 
       /* Integer fragment inputs must be qualified with 'flat'.  In GLSL ES,
@@ -3880,6 +4089,33 @@ ast_declarator_list::hir(exec_list *instructions,
       }
 
 
+      /* From section 4.3.4 of the GLSL 4.00 spec:
+       *    "Input variables may not be declared using the patch in qualifier
+       *    in tessellation control or geometry shaders."
+       *
+       * From section 4.3.6 of the GLSL 4.00 spec:
+       *    "It is an error to use patch out in a vertex, tessellation
+       *    evaluation, or geometry shader."
+       *
+       * This doesn't explicitly forbid using them in a fragment shader, but
+       * that's probably just an oversight.
+       */
+      if (state->stage != MESA_SHADER_TESS_EVAL
+          && this->type->qualifier.flags.q.patch
+          && this->type->qualifier.flags.q.in) {
+
+         _mesa_glsl_error(&loc, state, "'patch in' can only be used in a "
+                          "tessellation evaluation shader");
+      }
+
+      if (state->stage != MESA_SHADER_TESS_CTRL
+          && this->type->qualifier.flags.q.patch
+          && this->type->qualifier.flags.q.out) {
+
+         _mesa_glsl_error(&loc, state, "'patch out' can only be used in a "
+                          "tessellation control shader");
+      }
+
       /* Precision qualifiers exists only in GLSL versions 1.00 and >= 1.30.
        */
       if (this->type->qualifier.precision != ast_precision_none) {
@@ -3891,9 +4127,7 @@ ast_declarator_list::hir(exec_list *instructions,
        * an array of that type.
        */
       if (!(this->type->qualifier.precision == ast_precision_none
-          || precision_qualifier_allowed(var->type)
-          || (var->type->is_array()
-	      && precision_qualifier_allowed(var->type->fields.array)))) {
+          || precision_qualifier_allowed(var->type->without_array()))) {
 
          _mesa_glsl_error(&loc, state,
                           "precision qualifiers apply only to floating point"
@@ -4196,6 +4430,7 @@ ast_function::hir(exec_list *instructions,
    ir_function *f = NULL;
    ir_function_signature *sig = NULL;
    exec_list hir_parameters;
+   YYLTYPE loc = this->get_location();
 
    const char *const name = identifier;
 
@@ -4247,6 +4482,17 @@ ast_function::hir(exec_list *instructions,
       return_type = glsl_type::error_type;
    }
 
+   /* ARB_shader_subroutine states:
+    *  "Subroutine declarations cannot be prototyped. It is an error to prepend
+    *   subroutine(...) to a function declaration."
+    */
+   if (this->return_type->qualifier.flags.q.subroutine_def && !is_definition) {
+      YYLTYPE loc = this->get_location();
+      _mesa_glsl_error(&loc, state,
+                       "function declaration `%s' cannot have subroutine prepended",
+                       name);
+   }
+
    /* From page 56 (page 62 of the PDF) of the GLSL 1.30 spec:
     * "No qualifier is allowed on the return type of a function."
     */
@@ -4284,15 +4530,15 @@ ast_function::hir(exec_list *instructions,
    f = state->symbols->get_function(name);
    if (f == NULL) {
       f = new(ctx) ir_function(name);
-      if (!state->symbols->add_function(f)) {
-         /* This function name shadows a non-function use of the same name. */
-         YYLTYPE loc = this->get_location();
-
-         _mesa_glsl_error(&loc, state, "function name `%s' conflicts with "
-                          "non-function", name);
-         return NULL;
+      if (!this->return_type->qualifier.flags.q.subroutine) {
+         if (!state->symbols->add_function(f)) {
+            /* This function name shadows a non-function use of the same name. */
+            YYLTYPE loc = this->get_location();
+            _mesa_glsl_error(&loc, state, "function name `%s' conflicts with "
+                             "non-function", name);
+            return NULL;
+         }
       }
-
       emit_function(state, f);
    }
 
@@ -4379,6 +4625,44 @@ ast_function::hir(exec_list *instructions,
    sig->replace_parameters(&hir_parameters);
    signature = sig;
 
+   if (this->return_type->qualifier.flags.q.subroutine_def) {
+      int idx;
+
+      f->num_subroutine_types = this->return_type->qualifier.subroutine_list->declarations.length();
+      f->subroutine_types = ralloc_array(state, const struct glsl_type *,
+                                         f->num_subroutine_types);
+      idx = 0;
+      foreach_list_typed(ast_declaration, decl, link, &this->return_type->qualifier.subroutine_list->declarations) {
+         const struct glsl_type *type;
+         /* the subroutine type must be already declared */
+         type = state->symbols->get_type(decl->identifier);
+         if (!type) {
+            _mesa_glsl_error(& loc, state, "unknown type '%s' in subroutine function definition", decl->identifier);
+         }
+         f->subroutine_types[idx++] = type;
+      }
+      state->subroutines = (ir_function **)reralloc(state, state->subroutines,
+                                                    ir_function *,
+                                                    state->num_subroutines + 1);
+      state->subroutines[state->num_subroutines] = f;
+      state->num_subroutines++;
+
+   }
+
+   if (this->return_type->qualifier.flags.q.subroutine) {
+      if (!state->symbols->add_type(this->identifier, glsl_type::get_subroutine_instance(this->identifier))) {
+         _mesa_glsl_error(& loc, state, "type '%s' previously defined", this->identifier);
+         return NULL;
+      }
+      state->subroutine_types = (ir_function **)reralloc(state, state->subroutine_types,
+                                                         ir_function *,
+                                                         state->num_subroutine_types + 1);
+      state->subroutine_types[state->num_subroutine_types] = f;
+      state->num_subroutine_types++;
+
+      f->is_subroutine = true;
+   }
+
    /* Function declarations (prototypes) do not have r-values.
     */
    return NULL;
@@ -5277,8 +5561,9 @@ ast_type_specifier::hir(exec_list *instructions,
  * \c glsl_struct_field to describe the members.
  *
  * If we're processing an interface block, var_mode should be the type of the
- * interface block (ir_var_shader_in, ir_var_shader_out, or ir_var_uniform).
- * If we're processing a structure, var_mode should be ir_var_auto.
+ * interface block (ir_var_shader_in, ir_var_shader_out, ir_var_uniform or
+ * ir_var_shader_storage).  If we're processing a structure, var_mode should be
+ * ir_var_auto.
  *
  * \return
  * The number of fields processed.  A pointer to the array structure fields is
@@ -5351,19 +5636,19 @@ ast_process_structure_or_interface_block(exec_list *instructions,
          if (is_interface && field_type->contains_opaque()) {
             YYLTYPE loc = decl_list->get_location();
             _mesa_glsl_error(&loc, state,
-                             "uniform in non-default uniform block contains "
+                             "uniform/buffer in non-default interface block contains "
                              "opaque variable");
          }
 
          if (field_type->contains_atomic()) {
-            /* FINISHME: Add a spec quotation here once updated spec
-             * FINISHME: language is available.  See Khronos bug #10903
-             * FINISHME: on whether atomic counters are allowed in
-             * FINISHME: structures.
+            /* From section 4.1.7.3 of the GLSL 4.40 spec:
+             *
+             *    "Members of structures cannot be declared as atomic counter
+             *     types."
              */
             YYLTYPE loc = decl_list->get_location();
-            _mesa_glsl_error(&loc, state, "atomic counter in structure or "
-                             "uniform block");
+            _mesa_glsl_error(&loc, state, "atomic counter in structure, "
+                             "shader storage block or uniform block");
          }
 
          if (field_type->contains_image()) {
@@ -5373,7 +5658,8 @@ ast_process_structure_or_interface_block(exec_list *instructions,
              */
             YYLTYPE loc = decl_list->get_location();
             _mesa_glsl_error(&loc, state,
-                             "image in structure or uniform block");
+                             "image in structure, shader storage block or "
+                             "uniform block");
          }
 
          const struct ast_type_qualifier *const qual =
@@ -5382,9 +5668,9 @@ ast_process_structure_or_interface_block(exec_list *instructions,
              qual->flags.q.packed ||
              qual->flags.q.shared) {
             _mesa_glsl_error(&loc, state,
-                             "uniform block layout qualifiers std140, packed, and "
-                             "shared can only be applied to uniform blocks, not "
-                             "members");
+                             "uniform/shader storage block layout qualifiers "
+                             "std140, packed, and shared can only be applied "
+                             "to uniform/shader storage blocks, not members");
          }
 
          if (qual->flags.q.constant) {
@@ -5403,15 +5689,16 @@ ast_process_structure_or_interface_block(exec_list *instructions,
             interpret_interpolation_qualifier(qual, var_mode, state, &loc);
          fields[i].centroid = qual->flags.q.centroid ? 1 : 0;
          fields[i].sample = qual->flags.q.sample ? 1 : 0;
+         fields[i].patch = qual->flags.q.patch ? 1 : 0;
 
          /* Only save explicitly defined streams in block's field */
          fields[i].stream = qual->flags.q.explicit_stream ? qual->stream : -1;
 
          if (qual->flags.q.row_major || qual->flags.q.column_major) {
-            if (!qual->flags.q.uniform) {
+            if (!qual->flags.q.uniform && !qual->flags.q.buffer) {
                _mesa_glsl_error(&loc, state,
                                 "row_major and column_major can only be "
-                                "applied to uniform interface blocks");
+                                "applied to interface blocks");
             } else
                validate_matrix_layout_for_type(state, &loc, field_type, NULL);
          }
@@ -5608,6 +5895,9 @@ ast_interface_block::hir(exec_list *instructions,
    } else if (this->layout.flags.q.uniform) {
       var_mode = ir_var_uniform;
       iface_type_name = "uniform";
+   } else if (this->layout.flags.q.buffer) {
+      var_mode = ir_var_shader_storage;
+      iface_type_name = "buffer";
    } else {
       var_mode = ir_var_auto;
       iface_type_name = "UNKNOWN";
@@ -5692,16 +5982,28 @@ ast_interface_block::hir(exec_list *instructions,
          if (ir_variable *earlier_gl_Position =
              state->symbols->get_variable("gl_Position")) {
             earlier_per_vertex = earlier_gl_Position->get_interface_type();
+         } else if (ir_variable *earlier_gl_out =
+               state->symbols->get_variable("gl_out")) {
+            earlier_per_vertex = earlier_gl_out->get_interface_type();
          } else {
             _mesa_glsl_error(&loc, state,
                              "redeclaration of gl_PerVertex output not "
                              "allowed in the %s shader",
                              _mesa_shader_stage_to_string(state->stage));
          }
-         if (this->instance_name != NULL) {
-            _mesa_glsl_error(&loc, state,
-                             "gl_PerVertex output may not be redeclared with "
-                             "an instance name");
+         if (state->stage == MESA_SHADER_TESS_CTRL) {
+            if (this->instance_name == NULL ||
+                strcmp(this->instance_name, "gl_out") != 0 || this->array_specifier == NULL) {
+               _mesa_glsl_error(&loc, state,
+                                "gl_PerVertex output must be redeclared as "
+                                "gl_out[]");
+            }
+         } else {
+            if (this->instance_name != NULL) {
+               _mesa_glsl_error(&loc, state,
+                                "gl_PerVertex output may not be redeclared with "
+                                "an instance name");
+            }
          }
          break;
       default:
@@ -5734,6 +6036,8 @@ ast_interface_block::hir(exec_list *instructions,
                earlier_per_vertex->fields.structure[j].centroid;
             fields[i].sample =
                earlier_per_vertex->fields.structure[j].sample;
+            fields[i].patch =
+               earlier_per_vertex->fields.structure[j].patch;
          }
       }
 
@@ -5787,8 +6091,18 @@ ast_interface_block::hir(exec_list *instructions,
    if (state->stage == MESA_SHADER_GEOMETRY && this->array_specifier == NULL &&
        var_mode == ir_var_shader_in) {
       _mesa_glsl_error(&loc, state, "geometry shader inputs must be arrays");
+   } else if ((state->stage == MESA_SHADER_TESS_CTRL ||
+               state->stage == MESA_SHADER_TESS_EVAL) &&
+              this->array_specifier == NULL &&
+              var_mode == ir_var_shader_in) {
+      _mesa_glsl_error(&loc, state, "per-vertex tessellation shader inputs must be arrays");
+   } else if (state->stage == MESA_SHADER_TESS_CTRL &&
+              this->array_specifier == NULL &&
+              var_mode == ir_var_shader_out) {
+      _mesa_glsl_error(&loc, state, "tessellation control shader outputs must be arrays");
    }
 
+
    /* Page 39 (page 45 of the PDF) of section 4.3.7 in the GLSL ES 3.00 spec
     * says:
     *
@@ -5834,16 +6148,39 @@ ast_interface_block::hir(exec_list *instructions,
           *     geometry shader inputs. All other input and output block
           *     arrays must specify an array size.
           *
+          * The same applies to tessellation shaders.
+          *
           * The upshot of this is that the only circumstance where an
           * interface array size *doesn't* need to be specified is on a
-          * geometry shader input.
+          * geometry shader input, tessellation control shader input,
+          * tessellation control shader output, and tessellation evaluation
+          * shader input.
           */
-         if (this->array_specifier->is_unsized_array &&
-             (state->stage != MESA_SHADER_GEOMETRY || !this->layout.flags.q.in)) {
-            _mesa_glsl_error(&loc, state,
-                             "only geometry shader inputs may be unsized "
-                             "instance block arrays");
+         if (this->array_specifier->is_unsized_array) {
+            bool allow_inputs = state->stage == MESA_SHADER_GEOMETRY ||
+                                state->stage == MESA_SHADER_TESS_CTRL ||
+                                state->stage == MESA_SHADER_TESS_EVAL;
+            bool allow_outputs = state->stage == MESA_SHADER_TESS_CTRL;
 
+            if (this->layout.flags.q.in) {
+               if (!allow_inputs)
+                  _mesa_glsl_error(&loc, state,
+                                   "unsized input block arrays not allowed in "
+                                   "%s shader",
+                                   _mesa_shader_stage_to_string(state->stage));
+            } else if (this->layout.flags.q.out) {
+               if (!allow_outputs)
+                  _mesa_glsl_error(&loc, state,
+                                   "unsized output block arrays not allowed in "
+                                   "%s shader",
+                                   _mesa_shader_stage_to_string(state->stage));
+            } else {
+               /* by elimination, this is a uniform block array */
+               _mesa_glsl_error(&loc, state,
+                                "unsized uniform block arrays not allowed in "
+                                "%s shader",
+                                _mesa_shader_stage_to_string(state->stage));
+            }
          }
 
          const glsl_type *block_array_type =
@@ -5877,6 +6214,11 @@ ast_interface_block::hir(exec_list *instructions,
 
       if (state->stage == MESA_SHADER_GEOMETRY && var_mode == ir_var_shader_in)
          handle_geometry_shader_input_decl(state, loc, var);
+      else if ((state->stage == MESA_SHADER_TESS_CTRL ||
+           state->stage == MESA_SHADER_TESS_EVAL) && var_mode == ir_var_shader_in)
+         handle_tess_shader_input_decl(state, loc, var);
+      else if (state->stage == MESA_SHADER_TESS_CTRL && var_mode == ir_var_shader_out)
+         handle_tess_ctrl_shader_output_decl(state, loc, var);
 
       if (ir_variable *earlier =
           state->symbols->get_variable(this->instance_name)) {
@@ -5917,6 +6259,7 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.interpolation = fields[i].interpolation;
          var->data.centroid = fields[i].centroid;
          var->data.sample = fields[i].sample;
+         var->data.patch = fields[i].patch;
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
@@ -5965,8 +6308,8 @@ ast_interface_block::hir(exec_list *instructions,
          if (state->symbols->get_variable(var->name) != NULL)
             _mesa_glsl_error(&loc, state, "`%s' redeclared", var->name);
 
-         /* Propagate the "binding" keyword into this UBO's fields;
-          * the UBO declaration itself doesn't get an ir_variable unless it
+         /* Propagate the "binding" keyword into this UBO/SSBO's fields.
+          * The UBO declaration itself doesn't get an ir_variable unless it
           * has an instance name.  This is ugly.
           */
          var->data.explicit_binding = this->layout.flags.q.explicit_binding;
@@ -6024,6 +6367,67 @@ ast_interface_block::hir(exec_list *instructions,
 }
 
 
+ir_rvalue *
+ast_tcs_output_layout::hir(exec_list *instructions,
+			  struct _mesa_glsl_parse_state *state)
+{
+   YYLTYPE loc = this->get_location();
+
+   /* If any tessellation control output layout declaration preceded this
+    * one, make sure it was consistent with this one.
+    */
+   if (state->tcs_output_vertices_specified &&
+       state->out_qualifier->vertices != this->vertices) {
+      _mesa_glsl_error(&loc, state,
+		       "tessellation control shader output layout does not "
+		       "match previous declaration");
+      return NULL;
+   }
+
+   /* If any shader outputs occurred before this declaration and specified an
+    * array size, make sure the size they specified is consistent with the
+    * vertex count declared by this layout.
+    */
+   unsigned num_vertices = this->vertices;
+   if (state->tcs_output_size != 0 && state->tcs_output_size != num_vertices) {
+      _mesa_glsl_error(&loc, state,
+		       "this tessellation control shader output layout "
+		       "specifies %u vertices, but a previous output "
+		       "is declared with size %u",
+		       num_vertices, state->tcs_output_size);
+      return NULL;
+   }
+
+   state->tcs_output_vertices_specified = true;
+
+   /* If any shader outputs occurred before this declaration and did not
+    * specify an array size, their size is determined now.
+    */
+   foreach_in_list (ir_instruction, node, instructions) {
+      ir_variable *var = node->as_variable();
+      if (var == NULL || var->data.mode != ir_var_shader_out)
+	 continue;
+
+      /* Note: Not all tessellation control shader outputs are arrays. */
+      if (!var->type->is_unsized_array() || var->data.patch)
+         continue;
+
+      if (var->data.max_array_access >= num_vertices) {
+	 _mesa_glsl_error(&loc, state,
+			  "this tessellation control shader output layout "
+			  "specifies %u vertices, but an access to element "
+			  "%u of output `%s' already exists", num_vertices,
+			  var->data.max_array_access, var->name);
+      } else {
+	 var->type = glsl_type::get_array_instance(var->type->fields.array,
+						   num_vertices);
+      }
+   }
+
+   return NULL;   /* every path returns NULL: no r-value is produced here */
+}
+
+
 ir_rvalue *
 ast_gs_input_layout::hir(exec_list *instructions,
                          struct _mesa_glsl_parse_state *state)
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 5eb2913d6b7..892122af03d 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -40,7 +40,12 @@ ast_type_specifier::print(void) const
 bool
 ast_fully_specified_type::has_qualifiers() const
 {
-   return this->qualifier.flags.i != 0;
+   /* 'subroutine' isn't a real qualifier. */
+   ast_type_qualifier subroutine_only;
+   subroutine_only.flags.i = 0;
+   subroutine_only.flags.q.subroutine = 1;
+   subroutine_only.flags.q.subroutine_def = 1;
+   return (this->qualifier.flags.i & ~subroutine_only.flags.i) != 0;
 }
 
 bool ast_type_qualifier::has_interpolation() const
@@ -78,14 +83,16 @@ ast_type_qualifier::has_storage() const
           || this->flags.q.varying
           || this->flags.q.in
           || this->flags.q.out
-          || this->flags.q.uniform;
+          || this->flags.q.uniform
+          || this->flags.q.buffer;
 }
 
 bool
 ast_type_qualifier::has_auxiliary_storage() const
 {
    return this->flags.q.centroid
-          || this->flags.q.sample;
+          || this->flags.q.sample
+          || this->flags.q.patch;
 }
 
 const char*
@@ -211,6 +218,44 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
       }
    }
 
+   if (q.flags.q.vertices) {
+      if (this->flags.q.vertices && this->vertices != q.vertices) {
+	 _mesa_glsl_error(loc, state,
+			  "tessellation control shader set conflicting "
+			  "vertices (%d and %d)",
+			  this->vertices, q.vertices);
+	 return false;
+      }
+      this->vertices = q.vertices;
+   }
+
+   if (q.flags.q.vertex_spacing) {
+      if (this->flags.q.vertex_spacing && this->vertex_spacing != q.vertex_spacing) {
+	 _mesa_glsl_error(loc, state,
+			  "conflicting vertex spacing used");
+	 return false;
+      }
+      this->vertex_spacing = q.vertex_spacing;
+   }
+
+   if (q.flags.q.ordering) {
+      if (this->flags.q.ordering && this->ordering != q.ordering) {
+	 _mesa_glsl_error(loc, state,
+			  "conflicting ordering used");
+	 return false;
+      }
+      this->ordering = q.ordering;
+   }
+
+   if (q.flags.q.point_mode) {
+      if (this->flags.q.point_mode && this->point_mode != q.point_mode) {
+	 _mesa_glsl_error(loc, state,
+			  "conflicting point mode used");
+	 return false;
+      }
+      this->point_mode = q.point_mode;
+   }
+
    if ((q.flags.i & ubo_mat_mask.flags.i) != 0)
       this->flags.i &= ~ubo_mat_mask.flags.i;
    if ((q.flags.i & ubo_layout_mask.flags.i) != 0)
@@ -260,6 +305,22 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
    return true;
 }
 
+bool
+ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
+                                        _mesa_glsl_parse_state *state,
+                                        ast_type_qualifier q,
+                                        ast_node* &node)
+{
+   void *mem_ctx = state;   /* the parse state is the allocation context */
+   const bool r = this->merge_qualifier(loc, state, q);
+
+   if (state->stage == MESA_SHADER_TESS_CTRL) {
+      node = new(mem_ctx) ast_tcs_output_layout(*loc, q.vertices);
+   }
+
+   return r;   /* merge result; 'node' is only set for tess control shaders */
+}
+
 bool
 ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
                                        _mesa_glsl_parse_state *state,
@@ -273,6 +334,27 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
    valid_in_mask.flags.i = 0;
 
    switch (state->stage) {
+   case MESA_SHADER_TESS_EVAL:
+      if (q.flags.q.prim_type) {
+         /* Make sure this is a valid input primitive type. */
+         switch (q.prim_type) {
+         case GL_TRIANGLES:
+         case GL_QUADS:
+         case GL_ISOLINES:
+            break;
+         default:
+            _mesa_glsl_error(loc, state,
+                             "invalid tessellation evaluation "
+                             "shader input primitive type");
+            break;
+         }
+      }
+
+      valid_in_mask.flags.q.prim_type = 1;
+      valid_in_mask.flags.q.vertex_spacing = 1;
+      valid_in_mask.flags.q.ordering = 1;
+      valid_in_mask.flags.q.point_mode = 1;
+      break;
    case MESA_SHADER_GEOMETRY:
       if (q.flags.q.prim_type) {
          /* Make sure this is a valid input primitive type. */
@@ -328,7 +410,9 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
       if (q.flags.q.prim_type &&
           this->prim_type != q.prim_type) {
          _mesa_glsl_error(loc, state,
-                          "conflicting input primitive types specified");
+                          "conflicting input primitive %s specified",
+                          state->stage == MESA_SHADER_GEOMETRY ?
+                          "type" : "mode");
       }
    } else if (q.flags.q.prim_type) {
       state->in_qualifier->flags.q.prim_type = 1;
@@ -350,6 +434,39 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
       state->fs_early_fragment_tests = true;
    }
 
+   if (this->flags.q.vertex_spacing) {
+      if (q.flags.q.vertex_spacing &&
+          this->vertex_spacing != q.vertex_spacing) {
+         _mesa_glsl_error(loc, state,
+                          "conflicting vertex spacing specified");
+      }
+   } else if (q.flags.q.vertex_spacing) {
+      this->flags.q.vertex_spacing = 1;
+      this->vertex_spacing = q.vertex_spacing;
+   }
+
+   if (this->flags.q.ordering) {
+      if (q.flags.q.ordering &&
+          this->ordering != q.ordering) {
+         _mesa_glsl_error(loc, state,
+                          "conflicting ordering specified");
+      }
+   } else if (q.flags.q.ordering) {
+      this->flags.q.ordering = 1;
+      this->ordering = q.ordering;
+   }
+
+   if (this->flags.q.point_mode) {
+      if (q.flags.q.point_mode &&
+          this->point_mode != q.point_mode) {
+         _mesa_glsl_error(loc, state,
+                          "conflicting point mode specified");
+      }
+   } else if (q.flags.q.point_mode) {
+      this->flags.q.point_mode = 1;
+      this->point_mode = q.point_mode;
+   }
+
    if (create_gs_ast) {
       node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type);
    } else if (create_cs_ast) {
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index efab2991993..2175c66cbd7 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -413,8 +413,8 @@ fp64(const _mesa_glsl_parse_state *state)
 static bool
 barrier_supported(const _mesa_glsl_parse_state *state)
 {
-   return state->stage == MESA_SHADER_COMPUTE;
-   /* TODO: || stage->state == MESA_SHADER_TESS_CTRL; */
+   return state->stage == MESA_SHADER_COMPUTE ||
+          state->stage == MESA_SHADER_TESS_CTRL;
 }
 
 /** @} */
diff --git a/src/glsl/builtin_types.cpp b/src/glsl/builtin_types.cpp
index d92e2eb3007..ffbc5e6fdbc 100644
--- a/src/glsl/builtin_types.cpp
+++ b/src/glsl/builtin_types.cpp
@@ -54,64 +54,64 @@
       &glsl_type::_struct_##NAME##_type;
 
 static const struct glsl_struct_field gl_DepthRangeParameters_fields[] = {
-   { glsl_type::float_type, "near", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "far",  -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "diff", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::float_type, "near"),
+   glsl_struct_field(glsl_type::float_type, "far"),
+   glsl_struct_field(glsl_type::float_type, "diff"),
 };
 
 static const struct glsl_struct_field gl_PointParameters_fields[] = {
-   { glsl_type::float_type, "size", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "sizeMin", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "sizeMax", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "fadeThresholdSize", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "distanceConstantAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "distanceLinearAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "distanceQuadraticAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::float_type, "size"),
+   glsl_struct_field(glsl_type::float_type, "sizeMin"),
+   glsl_struct_field(glsl_type::float_type, "sizeMax"),
+   glsl_struct_field(glsl_type::float_type, "fadeThresholdSize"),
+   glsl_struct_field(glsl_type::float_type, "distanceConstantAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "distanceLinearAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "distanceQuadraticAttenuation"),
 };
 
 static const struct glsl_struct_field gl_MaterialParameters_fields[] = {
-   { glsl_type::vec4_type, "emission", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "diffuse", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "specular", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "shininess", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "emission"),
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
+   glsl_struct_field(glsl_type::vec4_type, "diffuse"),
+   glsl_struct_field(glsl_type::vec4_type, "specular"),
+   glsl_struct_field(glsl_type::float_type, "shininess"),
 };
 
 static const struct glsl_struct_field gl_LightSourceParameters_fields[] = {
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "diffuse", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "specular", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "position", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "halfVector", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec3_type, "spotDirection", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "spotExponent", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "spotCutoff", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "spotCosCutoff", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "constantAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "linearAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "quadraticAttenuation", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
+   glsl_struct_field(glsl_type::vec4_type, "diffuse"),
+   glsl_struct_field(glsl_type::vec4_type, "specular"),
+   glsl_struct_field(glsl_type::vec4_type, "position"),
+   glsl_struct_field(glsl_type::vec4_type, "halfVector"),
+   glsl_struct_field(glsl_type::vec3_type, "spotDirection"),
+   glsl_struct_field(glsl_type::float_type, "spotExponent"),
+   glsl_struct_field(glsl_type::float_type, "spotCutoff"),
+   glsl_struct_field(glsl_type::float_type, "spotCosCutoff"),
+   glsl_struct_field(glsl_type::float_type, "constantAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "linearAttenuation"),
+   glsl_struct_field(glsl_type::float_type, "quadraticAttenuation"),
 };
 
 static const struct glsl_struct_field gl_LightModelParameters_fields[] = {
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
 };
 
 static const struct glsl_struct_field gl_LightModelProducts_fields[] = {
-   { glsl_type::vec4_type, "sceneColor", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "sceneColor"),
 };
 
 static const struct glsl_struct_field gl_LightProducts_fields[] = {
-   { glsl_type::vec4_type, "ambient", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "diffuse", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::vec4_type, "specular", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "ambient"),
+   glsl_struct_field(glsl_type::vec4_type, "diffuse"),
+   glsl_struct_field(glsl_type::vec4_type, "specular"),
 };
 
 static const struct glsl_struct_field gl_FogParameters_fields[] = {
-   { glsl_type::vec4_type, "color", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "density", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "start", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "end", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
-   { glsl_type::float_type, "scale", -1, 0, 0, 0, GLSL_MATRIX_LAYOUT_INHERITED, 0 },
+   glsl_struct_field(glsl_type::vec4_type, "color"),
+   glsl_struct_field(glsl_type::float_type, "density"),
+   glsl_struct_field(glsl_type::float_type, "start"),
+   glsl_struct_field(glsl_type::float_type, "end"),
+   glsl_struct_field(glsl_type::float_type, "scale"),
 };
 
 #include "builtin_type_macros.h"
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index a765d35fde0..53d3500b1f4 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -322,6 +322,7 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
    this->fields[this->num_fields].interpolation = INTERP_QUALIFIER_NONE;
    this->fields[this->num_fields].centroid = 0;
    this->fields[this->num_fields].sample = 0;
+   this->fields[this->num_fields].patch = 0;
    this->num_fields++;
 }
 
@@ -343,6 +344,8 @@ public:
    void generate_constants();
    void generate_uniforms();
    void generate_vs_special_vars();
+   void generate_tcs_special_vars();
+   void generate_tes_special_vars();
    void generate_gs_special_vars();
    void generate_fs_special_vars();
    void generate_cs_special_vars();
@@ -436,11 +439,12 @@ builtin_variable_generator::add_variable(const char *name,
       var->data.read_only = true;
       break;
    case ir_var_shader_out:
+   case ir_var_shader_storage:
       break;
    default:
       /* The only variables that are added using this function should be
-       * uniforms, shader inputs, and shader outputs, constants (which use
-       * ir_var_auto), and system values.
+       * uniforms, shader storage variables, shader inputs, shader outputs,
+       * constants (which use ir_var_auto), and system values.
        */
       assert(0);
       break;
@@ -669,8 +673,14 @@ builtin_variable_generator::generate_constants()
       if (!state->es_shader) {
          add_const("gl_MaxGeometryAtomicCounters",
                    state->Const.MaxGeometryAtomicCounters);
-         add_const("gl_MaxTessControlAtomicCounters", 0);
-         add_const("gl_MaxTessEvaluationAtomicCounters", 0);
+
+         if (state->is_version(400, 0) ||
+             state->ARB_tessellation_shader_enable) {
+            add_const("gl_MaxTessControlAtomicCounters",
+                      state->Const.MaxTessControlAtomicCounters);
+            add_const("gl_MaxTessEvaluationAtomicCounters",
+                      state->Const.MaxTessEvaluationAtomicCounters);
+         }
       }
    }
 
@@ -690,8 +700,10 @@ builtin_variable_generator::generate_constants()
       if (!state->es_shader) {
          add_const("gl_MaxGeometryAtomicCounterBuffers",
                    state->Const.MaxGeometryAtomicCounterBuffers);
-         add_const("gl_MaxTessControlAtomicCounterBuffers", 0);
-         add_const("gl_MaxTessEvaluationAtomicCounterBuffers", 0);
+         add_const("gl_MaxTessControlAtomicCounterBuffers",
+                   state->Const.MaxTessControlAtomicCounterBuffers);
+         add_const("gl_MaxTessEvaluationAtomicCounterBuffers",
+                   state->Const.MaxTessEvaluationAtomicCounterBuffers);
       }
    }
 
@@ -750,11 +762,35 @@ builtin_variable_generator::generate_constants()
                 state->Const.MaxFragmentImageUniforms);
       add_const("gl_MaxCombinedImageUniforms",
                 state->Const.MaxCombinedImageUniforms);
+
+      if (state->is_version(400, 0) ||
+          state->ARB_tessellation_shader_enable) {
+         add_const("gl_MaxTessControlImageUniforms",
+                   state->Const.MaxTessControlImageUniforms);
+         add_const("gl_MaxTessEvaluationImageUniforms",
+                   state->Const.MaxTessEvaluationImageUniforms);
+      }
    }
 
    if (state->is_version(410, 0) ||
        state->ARB_viewport_array_enable)
       add_const("gl_MaxViewports", state->Const.MaxViewports);
+
+   if (state->is_version(400, 0) ||
+       state->ARB_tessellation_shader_enable) {
+      add_const("gl_MaxPatchVertices", state->Const.MaxPatchVertices);
+      add_const("gl_MaxTessGenLevel", state->Const.MaxTessGenLevel);
+      add_const("gl_MaxTessControlInputComponents", state->Const.MaxTessControlInputComponents);
+      add_const("gl_MaxTessControlOutputComponents", state->Const.MaxTessControlOutputComponents);
+      add_const("gl_MaxTessControlTextureImageUnits", state->Const.MaxTessControlTextureImageUnits);
+      add_const("gl_MaxTessEvaluationInputComponents", state->Const.MaxTessEvaluationInputComponents);
+      add_const("gl_MaxTessEvaluationOutputComponents", state->Const.MaxTessEvaluationOutputComponents);
+      add_const("gl_MaxTessEvaluationTextureImageUnits", state->Const.MaxTessEvaluationTextureImageUnits);
+      add_const("gl_MaxTessPatchComponents", state->Const.MaxTessPatchComponents);
+      add_const("gl_MaxTessControlTotalOutputComponents", state->Const.MaxTessControlTotalOutputComponents);
+      add_const("gl_MaxTessControlUniformComponents", state->Const.MaxTessControlUniformComponents);
+      add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents);
+   }
 }
 
 
@@ -870,6 +906,39 @@ builtin_variable_generator::generate_vs_special_vars()
 }
 
 
+/**
+ * Generate variables which only exist in tessellation control shaders.
+ */
+void
+builtin_variable_generator::generate_tcs_special_vars()
+{
+   add_system_value(SYSTEM_VALUE_PRIMITIVE_ID, int_t, "gl_PrimitiveID");
+   add_system_value(SYSTEM_VALUE_VERTICES_IN, int_t, "gl_PatchVerticesIn");
+   add_system_value(SYSTEM_VALUE_INVOCATION_ID, int_t, "gl_InvocationID");
+
+   add_output(VARYING_SLOT_TESS_LEVEL_OUTER, array(float_t, 4),
+              "gl_TessLevelOuter")->data.patch = 1;
+   add_output(VARYING_SLOT_TESS_LEVEL_INNER, array(float_t, 2),
+              "gl_TessLevelInner")->data.patch = 1;
+}
+
+
+/**
+ * Generate variables which only exist in tessellation evaluation shaders.
+ */
+void
+builtin_variable_generator::generate_tes_special_vars()
+{
+   add_system_value(SYSTEM_VALUE_PRIMITIVE_ID, int_t, "gl_PrimitiveID");
+   add_system_value(SYSTEM_VALUE_VERTICES_IN, int_t, "gl_PatchVerticesIn");
+   add_system_value(SYSTEM_VALUE_TESS_COORD, vec3_t, "gl_TessCoord");
+   add_system_value(SYSTEM_VALUE_TESS_LEVEL_OUTER, array(float_t, 4),
+                    "gl_TessLevelOuter");
+   add_system_value(SYSTEM_VALUE_TESS_LEVEL_INNER, array(float_t, 2),
+                    "gl_TessLevelInner");
+}
+
+
 /**
  * Generate variables which only exist in geometry shaders.
  */
@@ -993,6 +1062,8 @@ builtin_variable_generator::add_varying(int slot, const glsl_type *type,
                                         const char *name_as_gs_input)
 {
    switch (state->stage) {
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       this->per_vertex_in.add_field(slot, type, name);
       /* FALLTHROUGH */
@@ -1045,13 +1116,40 @@ builtin_variable_generator::generate_varyings()
       }
    }
 
+   /* Section 7.1 (Built-In Language Variables) of the GLSL 4.00 spec
+    * says (the tessellation evaluation language declares gl_in likewise):
+    *
+    *    "In the tessellation control language, built-in variables are
+    *    intrinsically declared as:
+    *
+    *        in gl_PerVertex {
+    *            vec4 gl_Position;
+    *            float gl_PointSize;
+    *            float gl_ClipDistance[];
+    *        } gl_in[gl_MaxPatchVertices];"
+    */
+   if (state->stage == MESA_SHADER_TESS_CTRL ||
+       state->stage == MESA_SHADER_TESS_EVAL) {
+      const glsl_type *per_vertex_in_type =
+         this->per_vertex_in.construct_interface_instance();
+      add_variable("gl_in", array(per_vertex_in_type, state->Const.MaxPatchVertices),
+                   ir_var_shader_in, -1);
+   }
    if (state->stage == MESA_SHADER_GEOMETRY) {
       const glsl_type *per_vertex_in_type =
          this->per_vertex_in.construct_interface_instance();
       add_variable("gl_in", array(per_vertex_in_type, 0),
                    ir_var_shader_in, -1);
    }
-   if (state->stage == MESA_SHADER_VERTEX || state->stage == MESA_SHADER_GEOMETRY) {
+   if (state->stage == MESA_SHADER_TESS_CTRL) {
+      const glsl_type *per_vertex_out_type =
+         this->per_vertex_out.construct_interface_instance();
+      add_variable("gl_out", array(per_vertex_out_type, 0),
+                   ir_var_shader_out, -1);
+   }
+   if (state->stage == MESA_SHADER_VERTEX ||
+       state->stage == MESA_SHADER_TESS_EVAL ||
+       state->stage == MESA_SHADER_GEOMETRY) {
       const glsl_type *per_vertex_out_type =
          this->per_vertex_out.construct_interface_instance();
       const glsl_struct_field *fields = per_vertex_out_type->fields.structure;
@@ -1062,6 +1160,7 @@ builtin_variable_generator::generate_varyings()
          var->data.interpolation = fields[i].interpolation;
          var->data.centroid = fields[i].centroid;
          var->data.sample = fields[i].sample;
+         var->data.patch = fields[i].patch;
          var->init_interface_type(per_vertex_out_type);
       }
    }
@@ -1086,6 +1185,12 @@ _mesa_glsl_initialize_variables(exec_list *instructions,
    case MESA_SHADER_VERTEX:
       gen.generate_vs_special_vars();
       break;
+   case MESA_SHADER_TESS_CTRL:
+      gen.generate_tcs_special_vars();
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      gen.generate_tes_special_vars();
+      break;
    case MESA_SHADER_GEOMETRY:
       gen.generate_gs_special_vars();
       break;
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index a11b6b2c7c8..dd5ec2a30b5 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -1074,9 +1074,9 @@ _token_list_equal_ignoring_space (token_list_t *a, token_list_t *b)
 		 */
 		if (node_a->token->type == SPACE
 		    && node_b->token->type == SPACE) {
-			while (node_a->token->type == SPACE)
+			while (node_a && node_a->token->type == SPACE)
 				node_a = node_a->next;
-			while (node_b->token->type == SPACE)
+			while (node_b && node_b->token->type == SPACE)
 				node_b = node_b->next;
 			continue;
 		}
@@ -2483,6 +2483,15 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 
               if (extensions->ARB_shader_precision)
                  add_builtin_define(parser, "GL_ARB_shader_precision", 1);
+
+	      if (extensions->ARB_shader_storage_buffer_object)
+	         add_builtin_define(parser, "GL_ARB_shader_storage_buffer_object", 1);
+
+	      if (extensions->ARB_tessellation_shader)
+	         add_builtin_define(parser, "GL_ARB_tessellation_shader", 1);
+
+              if (extensions->ARB_shader_subroutine)
+                 add_builtin_define(parser, "GL_ARB_shader_subroutine", 1);
 	   }
 	}
 
diff --git a/src/glsl/glcpp/glcpp.c b/src/glsl/glcpp/glcpp.c
index 5144516a69c..c62f4efec9d 100644
--- a/src/glsl/glcpp/glcpp.c
+++ b/src/glsl/glcpp/glcpp.c
@@ -29,6 +29,7 @@
 #include "glcpp.h"
 #include "main/mtypes.h"
 #include "main/shaderobj.h"
+#include "util/strtod.h"
 
 extern int glcpp_parser_debug;
 
@@ -168,6 +169,8 @@ main (int argc, char *argv[])
 	if (shader == NULL)
 	   return 1;
 
+	_mesa_locale_init();
+
 	ret = glcpp_preprocess(ctx, &shader, &info_log, NULL, &gl_ctx);
 
 	printf("%s", shader);
diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index 10db5b8b632..efa0bb68099 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -308,12 +308,14 @@ in		return IN_TOK;
 out		return OUT_TOK;
 inout		return INOUT_TOK;
 uniform		return UNIFORM;
+buffer		return BUFFER;
 varying		DEPRECATED_ES_KEYWORD(VARYING);
 centroid	KEYWORD(120, 300, 120, 300, CENTROID);
 invariant	KEYWORD(120, 100, 120, 100, INVARIANT);
 flat		KEYWORD(130, 100, 130, 300, FLAT);
 smooth		KEYWORD(130, 300, 130, 300, SMOOTH);
 noperspective	KEYWORD(130, 300, 130, 0, NOPERSPECTIVE);
+patch		KEYWORD_WITH_ALT(0, 300, 400, 0, yyextra->ARB_tessellation_shader_enable, PATCH);
 
 sampler1D	DEPRECATED_ES_KEYWORD(SAMPLER1D);
 sampler2D	return SAMPLER2D;
@@ -424,7 +426,8 @@ layout		{
 		      || yyextra->ARB_uniform_buffer_object_enable
 		      || yyextra->ARB_fragment_coord_conventions_enable
                       || yyextra->ARB_shading_language_420pack_enable
-                      || yyextra->ARB_compute_shader_enable) {
+                      || yyextra->ARB_compute_shader_enable
+                      || yyextra->ARB_tessellation_shader_enable) {
 		      return LAYOUT_TOK;
 		   } else {
 		      void *mem_ctx = yyextra;
@@ -575,9 +578,8 @@ usamplerBuffer	KEYWORD(140, 300, 140, 0, USAMPLERBUFFER);
 
     /* Additional reserved words in GLSL ES 3.00 */
 resource	KEYWORD(0, 300, 0, 0, RESOURCE);
-patch		KEYWORD(0, 300, 0, 0, PATCH);
 sample		KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE);
-subroutine	KEYWORD(0, 300, 0, 0, SUBROUTINE);
+subroutine	KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_enable, SUBROUTINE);
 
 
 [_a-zA-Z][_a-zA-Z0-9]*	{
@@ -593,6 +595,10 @@ subroutine	KEYWORD(0, 300, 0, 0, SUBROUTINE);
 			    return classify_identifier(state, yytext);
 			}
 
+\.			{ struct _mesa_glsl_parse_state *state = yyextra;
+			  state->is_field = true;
+			  return DOT_TOK; }
+
 .			{ return yytext[0]; }
 
 %%
@@ -600,6 +606,10 @@ subroutine	KEYWORD(0, 300, 0, 0, SUBROUTINE);
 int
 classify_identifier(struct _mesa_glsl_parse_state *state, const char *name)
 {
+   if (state->is_field) {
+      state->is_field = false;
+      return FIELD_SELECTION;
+   }
    if (state->symbols->get_variable(name) || state->symbols->get_function(name))
       return IDENTIFIER;
    else if (state->symbols->get_type(name))
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 05fa4ea9ac5..97648c15ccc 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -121,7 +121,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
    ast_case_statement *case_statement;
    ast_case_statement_list *case_statement_list;
    ast_interface_block *interface_block;
-
+   ast_subroutine_list *subroutine_list;
    struct {
       ast_node *cond;
       ast_expression *rest;
@@ -134,7 +134,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 }
 
 %token ATTRIBUTE CONST_TOK BOOL_TOK FLOAT_TOK INT_TOK UINT_TOK DOUBLE_TOK
-%token BREAK CONTINUE DO ELSE FOR IF DISCARD RETURN SWITCH CASE DEFAULT
+%token BREAK BUFFER CONTINUE DO ELSE FOR IF DISCARD RETURN SWITCH CASE DEFAULT
 %token BVEC2 BVEC3 BVEC4 IVEC2 IVEC3 IVEC4 UVEC2 UVEC3 UVEC4 VEC2 VEC3 VEC4 DVEC2 DVEC3 DVEC4
 %token CENTROID IN_TOK OUT_TOK INOUT_TOK UNIFORM VARYING SAMPLE
 %token NOPERSPECTIVE FLAT SMOOTH
@@ -186,7 +186,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %token PRAGMA_OPTIMIZE_ON PRAGMA_OPTIMIZE_OFF
 %token PRAGMA_INVARIANT_ALL
 %token LAYOUT_TOK
-
+%token DOT_TOK
    /* Reserved words that are not actually used in the grammar.
     */
 %token ASM CLASS UNION ENUM TYPEDEF TEMPLATE THIS PACKED_TOK GOTO
@@ -215,6 +215,8 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %type <type_qualifier> layout_qualifier_id_list layout_qualifier_id
 %type <type_qualifier> interface_block_layout_qualifier
 %type <type_qualifier> memory_qualifier
+%type <type_qualifier> subroutine_qualifier
+%type <subroutine_list> subroutine_type_list
 %type <type_qualifier> interface_qualifier
 %type <type_specifier> type_specifier
 %type <type_specifier> type_specifier_nonarray
@@ -260,10 +262,6 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %type <expression> function_call_generic
 %type <expression> function_call_or_method
 %type <expression> function_call
-%type <expression> method_call_generic
-%type <expression> method_call_header_with_parameters
-%type <expression> method_call_header_no_parameters
-%type <expression> method_call_header
 %type <n> assignment_operator
 %type <n> unary_operator
 %type <expression> function_identifier
@@ -476,7 +474,7 @@ postfix_expression:
    {
       $$ = $1;
    }
-   | postfix_expression '.' any_identifier
+   | postfix_expression DOT_TOK FIELD_SELECTION
    {
       void *ctx = state;
       $$ = new(ctx) ast_expression(ast_field_selection, $1, NULL, NULL);
@@ -507,12 +505,6 @@ function_call:
 
 function_call_or_method:
    function_call_generic
-   | postfix_expression '.' method_call_generic
-   {
-      void *ctx = state;
-      $$ = new(ctx) ast_expression(ast_field_selection, $1, $3, NULL);
-      $$->set_location_range(@1, @3);
-   }
    ;
 
 function_call_generic:
@@ -554,62 +546,17 @@ function_identifier:
       $$ = new(ctx) ast_function_expression($1);
       $$->set_location(@1);
       }
-   | variable_identifier
+   | postfix_expression
    {
       void *ctx = state;
-      ast_expression *callee = new(ctx) ast_expression($1);
-      callee->set_location(@1);
-      $$ = new(ctx) ast_function_expression(callee);
+      $$ = new(ctx) ast_function_expression($1);
       $$->set_location(@1);
       }
-   | FIELD_SELECTION
-   {
-      void *ctx = state;
-      ast_expression *callee = new(ctx) ast_expression($1);
-      callee->set_location(@1);
-      $$ = new(ctx) ast_function_expression(callee);
-      $$->set_location(@1);
-      }
-   ;
-
-method_call_generic:
-   method_call_header_with_parameters ')'
-   | method_call_header_no_parameters ')'
-   ;
-
-method_call_header_no_parameters:
-   method_call_header VOID_TOK
-   | method_call_header
-   ;
-
-method_call_header_with_parameters:
-   method_call_header assignment_expression
-   {
-      $$ = $1;
-      $$->set_location(@1);
-      $$->expressions.push_tail(& $2->link);
-   }
-   | method_call_header_with_parameters ',' assignment_expression
-   {
-      $$ = $1;
-      $$->set_location(@1);
-      $$->expressions.push_tail(& $3->link);
-   }
    ;
 
    // Grammar Note: Constructors look like methods, but lexical
    // analysis recognized most of them as keywords. They are now
    // recognized through "type_specifier".
-method_call_header:
-   variable_identifier '('
-   {
-      void *ctx = state;
-      ast_expression *callee = new(ctx) ast_expression($1);
-      callee->set_location(@1);
-      $$ = new(ctx) ast_function_expression(callee);
-      $$->set_location(@1);
-   }
-   ;
 
    // Grammar Note: No traditional style type casts.
 unary_expression:
@@ -910,7 +857,11 @@ function_header:
       $$->return_type = $1;
       $$->identifier = $2;
 
-      state->symbols->add_function(new(state) ir_function($2));
+      if ($1->qualifier.flags.q.subroutine) {
+         /* add type for IDENTIFIER search */
+         state->symbols->add_type($2, glsl_type::get_subroutine_instance($2));
+      } else
+         state->symbols->add_function(new(state) ir_function($2));
       state->symbols->push_scope();
    }
    ;
@@ -983,7 +934,7 @@ parameter_qualifier:
       if (($1.flags.q.in || $1.flags.q.out) && ($2.flags.q.in || $2.flags.q.out))
          _mesa_glsl_error(&@1, state, "duplicate in/out/inout qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.q.constant)
+      if (!state->has_420pack() && $2.flags.q.constant)
          _mesa_glsl_error(&@1, state, "in/out/inout must come after const "
                                       "or precise");
 
@@ -995,7 +946,7 @@ parameter_qualifier:
       if ($2.precision != ast_precision_none)
          _mesa_glsl_error(&@1, state, "duplicate precision qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.i != 0)
+      if (!state->has_420pack() && $2.flags.i != 0)
          _mesa_glsl_error(&@1, state, "precision qualifiers must come last");
 
       $$ = $2;
@@ -1215,7 +1166,8 @@ layout_qualifier_id:
       /* Layout qualifiers for AMD/ARB_conservative_depth. */
       if (!$$.flags.i &&
           (state->AMD_conservative_depth_enable ||
-           state->ARB_conservative_depth_enable)) {
+           state->ARB_conservative_depth_enable ||
+           state->is_version(420, 0))) {
          if (match_layout_qualifier($1, "depth_any", state) == 0) {
             $$.flags.q.depth_any = 1;
          } else if (match_layout_qualifier($1, "depth_greater", state) == 0) {
@@ -1385,6 +1337,89 @@ layout_qualifier_id:
          }
       }
 
+      /* Layout qualifiers for tessellation evaluation shaders. */
+      if (!$$.flags.i) {
+         struct {
+            const char *s;
+            GLenum e;
+         } map[] = {
+                 /* triangles already parsed by gs-specific code */
+                 { "quads", GL_QUADS },
+                 { "isolines", GL_ISOLINES },
+         };
+         for (unsigned i = 0; i < ARRAY_SIZE(map); i++) {
+            if (match_layout_qualifier($1, map[i].s, state) == 0) {
+               $$.flags.q.prim_type = 1;
+               $$.prim_type = map[i].e;
+               break;
+            }
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "primitive mode qualifier `%s' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader", $1);
+         }
+      }
+      if (!$$.flags.i) {
+         struct {
+            const char *s;
+            GLenum e;
+         } map[] = {
+                 { "equal_spacing", GL_EQUAL },
+                 { "fractional_odd_spacing", GL_FRACTIONAL_ODD },
+                 { "fractional_even_spacing", GL_FRACTIONAL_EVEN },
+         };
+         for (unsigned i = 0; i < ARRAY_SIZE(map); i++) {
+            if (match_layout_qualifier($1, map[i].s, state) == 0) {
+               $$.flags.q.vertex_spacing = 1;
+               $$.vertex_spacing = map[i].e;
+               break;
+            }
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "vertex spacing qualifier `%s' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader", $1);
+         }
+      }
+      if (!$$.flags.i) {
+         if (match_layout_qualifier($1, "cw", state) == 0) {
+            $$.flags.q.ordering = 1;
+            $$.ordering = GL_CW;
+         } else if (match_layout_qualifier($1, "ccw", state) == 0) {
+            $$.flags.q.ordering = 1;
+            $$.ordering = GL_CCW;
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "ordering qualifier `%s' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader", $1);
+         }
+      }
+      if (!$$.flags.i) {
+         if (match_layout_qualifier($1, "point_mode", state) == 0) {
+            $$.flags.q.point_mode = 1;
+            $$.point_mode = true;
+         }
+
+         if ($$.flags.i &&
+             !state->ARB_tessellation_shader_enable &&
+             !state->is_version(400, 0)) {
+            _mesa_glsl_error(& @1, state,
+                             "qualifier `point_mode' requires "
+                             "GLSL 4.00 or ARB_tessellation_shader");
+         }
+      }
+
       if (!$$.flags.i) {
          _mesa_glsl_error(& @1, state, "unrecognized layout identifier "
                           "`%s'", $1);
@@ -1524,6 +1559,30 @@ layout_qualifier_id:
          }
       }
 
+      /* Layout qualifiers for tessellation control shaders. */
+      if (match_layout_qualifier("vertices", $1, state) == 0) {
+         $$.flags.q.vertices = 1;
+
+         if ($3 <= 0) {
+            _mesa_glsl_error(& @3, state,
+                             "invalid vertices (%d) specified", $3);
+            YYERROR;
+         } else if ($3 > (int)state->Const.MaxPatchVertices) {
+            _mesa_glsl_error(& @3, state,
+                             "vertices (%d) exceeds "
+                             "GL_MAX_PATCH_VERTICES", $3);
+            YYERROR;
+         } else {
+            $$.vertices = $3;
+            if (!state->ARB_tessellation_shader_enable &&
+                !state->is_version(400, 0)) {
+               _mesa_glsl_error(& @1, state,
+                                "vertices qualifier requires GLSL 4.00 or "
+                                "ARB_tessellation_shader");
+            }
+         }
+      }
+
       /* If the identifier didn't match any known layout identifiers,
        * emit an error.
        */
@@ -1571,6 +1630,41 @@ interface_block_layout_qualifier:
    }
    ;
 
+subroutine_qualifier:
+   SUBROUTINE
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.subroutine = 1;
+   }
+   | SUBROUTINE '(' subroutine_type_list ')'
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.subroutine_def = 1;
+      $$.subroutine_list = $3;
+   }
+   ;
+
+subroutine_type_list:
+   any_identifier
+   {
+      /* First name: start a new list holding a single declaration node. */
+      void *mem_ctx = state;
+      ast_declaration *entry = new(mem_ctx) ast_declaration($1, NULL, NULL);
+      entry->set_location(@1);
+      $$ = new(mem_ctx) ast_subroutine_list();
+      $$->declarations.push_tail(&entry->link);
+   }
+   | subroutine_type_list ',' any_identifier
+   {
+      /* Further names: append one more declaration node to the list. */
+      void *mem_ctx = state;
+      ast_declaration *entry = new(mem_ctx) ast_declaration($3, NULL, NULL);
+      entry->set_location(@3);
+      $$ = $1;
+      $$->declarations.push_tail(&entry->link);
+   }
+   ;
+
 interpolation_qualifier:
    SMOOTH
    {
@@ -1606,6 +1700,7 @@ type_qualifier:
    | interpolation_qualifier
    | layout_qualifier
    | memory_qualifier
+   | subroutine_qualifier
    | precision_qualifier
    {
       memset(&$$, 0, sizeof($$));
@@ -1637,7 +1732,7 @@ type_qualifier:
       if ($2.flags.q.invariant)
          _mesa_glsl_error(&@1, state, "duplicate \"invariant\" qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.q.precise)
+      if (!state->has_420pack() && $2.flags.q.precise)
          _mesa_glsl_error(&@1, state,
                           "\"invariant\" must come after \"precise\"");
 
@@ -1670,7 +1765,7 @@ type_qualifier:
       if ($2.has_interpolation())
          _mesa_glsl_error(&@1, state, "duplicate interpolation qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable &&
+      if (!state->has_420pack() &&
           ($2.flags.q.precise || $2.flags.q.invariant)) {
          _mesa_glsl_error(&@1, state, "interpolation qualifiers must come "
                           "after \"precise\" or \"invariant\"");
@@ -1690,12 +1785,17 @@ type_qualifier:
        * precise qualifiers since these are useful in ARB_separate_shader_objects.
        * There is no clear spec guidance on this either.
        */
-      if (!state->ARB_shading_language_420pack_enable && $2.has_layout())
+      if (!state->has_420pack() && $2.has_layout())
          _mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
 
       $$ = $1;
       $$.merge_qualifier(&@1, state, $2);
    }
+   | subroutine_qualifier type_qualifier
+   {
+      $$ = $1;
+      $$.merge_qualifier(&@1, state, $2);
+   }
    | auxiliary_storage_qualifier type_qualifier
    {
       if ($2.has_auxiliary_storage()) {
@@ -1703,7 +1803,7 @@ type_qualifier:
                           "duplicate auxiliary storage qualifier (centroid or sample)");
       }
 
-      if (!state->ARB_shading_language_420pack_enable &&
+      if (!state->has_420pack() &&
           ($2.flags.q.precise || $2.flags.q.invariant ||
            $2.has_interpolation() || $2.has_layout())) {
          _mesa_glsl_error(&@1, state, "auxiliary storage qualifiers must come "
@@ -1721,7 +1821,7 @@ type_qualifier:
       if ($2.has_storage())
          _mesa_glsl_error(&@1, state, "duplicate storage qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable &&
+      if (!state->has_420pack() &&
           ($2.flags.q.precise || $2.flags.q.invariant || $2.has_interpolation() ||
            $2.has_layout() || $2.has_auxiliary_storage())) {
          _mesa_glsl_error(&@1, state, "storage qualifiers must come after "
@@ -1737,7 +1837,7 @@ type_qualifier:
       if ($2.precision != ast_precision_none)
          _mesa_glsl_error(&@1, state, "duplicate precision qualifier");
 
-      if (!state->ARB_shading_language_420pack_enable && $2.flags.i != 0)
+      if (!state->has_420pack() && $2.flags.i != 0)
          _mesa_glsl_error(&@1, state, "precision qualifiers must come last");
 
       $$ = $2;
@@ -1761,7 +1861,11 @@ auxiliary_storage_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.sample = 1;
    }
-   /* TODO: "patch" also goes here someday. */
+   | PATCH
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.patch = 1;
+   }
 
 storage_qualifier:
    CONST_TOK
@@ -1808,6 +1912,11 @@ storage_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.uniform = 1;
    }
+   | BUFFER
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.buffer = 1;
+   }
    ;
 
 memory_qualifier:
@@ -2510,7 +2619,17 @@ basic_interface_block:
       block->block_name = $2;
       block->declarations.push_degenerate_list_at_head(& $4->link);
 
-      if ($1.flags.q.uniform) {
+      if ($1.flags.q.buffer) {
+         if (!state->has_shader_storage_buffer_objects()) {
+            _mesa_glsl_error(& @1, state,
+                             "#version 430 / GL_ARB_shader_storage_buffer_object "
+                             "required for defining shader storage blocks");
+         } else if (state->ARB_shader_storage_buffer_object_warn) {
+            _mesa_glsl_warning(& @1, state,
+                               "#version 430 / GL_ARB_shader_storage_buffer_object "
+                               "required for defining shader storage blocks");
+         }
+      } else if ($1.flags.q.uniform) {
          if (!state->has_uniform_buffer_objects()) {
             _mesa_glsl_error(& @1, state,
                              "#version 140 / GL_ARB_uniform_buffer_object "
@@ -2554,11 +2673,13 @@ basic_interface_block:
       uint64_t interface_type_mask;
       struct ast_type_qualifier temp_type_qualifier;
 
-      /* Get a bitmask containing only the in/out/uniform flags, allowing us
-       * to ignore other irrelevant flags like interpolation qualifiers.
+      /* Get a bitmask containing only the in/out/uniform/buffer
+       * flags, allowing us to ignore other irrelevant flags like
+       * interpolation qualifiers.
        */
       temp_type_qualifier.flags.i = 0;
       temp_type_qualifier.flags.q.uniform = true;
+      temp_type_qualifier.flags.q.buffer = true;
       temp_type_qualifier.flags.q.in = true;
       temp_type_qualifier.flags.q.out = true;
       interface_type_mask = temp_type_qualifier.flags.i;
@@ -2645,6 +2766,11 @@ interface_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.uniform = 1;
    }
+   | BUFFER
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.buffer = 1;
+   }
    ;
 
 instance_name_opt:
@@ -2723,11 +2849,8 @@ layout_defaults:
 
    | layout_qualifier OUT_TOK ';'
    {
-      if (state->stage != MESA_SHADER_GEOMETRY) {
-         _mesa_glsl_error(& @1, state,
-                          "out layout qualifiers only valid in "
-                          "geometry shaders");
-      } else {
+      $$ = NULL;
+      if (state->stage == MESA_SHADER_GEOMETRY) {
          if ($1.flags.q.prim_type) {
             /* Make sure this is a valid output primitive type. */
             switch ($1.prim_type) {
@@ -2746,6 +2869,12 @@ layout_defaults:
 
          /* Allow future assigments of global out's stream id value */
          state->out_qualifier->flags.q.explicit_stream = 0;
+      } else if (state->stage == MESA_SHADER_TESS_CTRL) {
+         if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$))
+            YYERROR;
+      } else {
+         _mesa_glsl_error(& @1, state,
+                          "out layout qualifiers only valid in "
+                          "tessellation control or geometry shaders");
       }
-      $$ = NULL;
    }
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index e26931de42f..ae2f35697fb 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -115,12 +115,18 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxGeometryUniformComponents = ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxUniformComponents;
 
    this->Const.MaxVertexAtomicCounters = ctx->Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters;
+   this->Const.MaxTessControlAtomicCounters = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicCounters;
+   this->Const.MaxTessEvaluationAtomicCounters = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicCounters;
    this->Const.MaxGeometryAtomicCounters = ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters;
    this->Const.MaxFragmentAtomicCounters = ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters;
    this->Const.MaxCombinedAtomicCounters = ctx->Const.MaxCombinedAtomicCounters;
    this->Const.MaxAtomicBufferBindings = ctx->Const.MaxAtomicBufferBindings;
    this->Const.MaxVertexAtomicCounterBuffers =
       ctx->Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers;
+   this->Const.MaxTessControlAtomicCounterBuffers =
+      ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicBuffers;
+   this->Const.MaxTessEvaluationAtomicCounterBuffers =
+      ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicBuffers;
    this->Const.MaxGeometryAtomicCounterBuffers =
       ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers;
    this->Const.MaxFragmentAtomicCounterBuffers =
@@ -140,6 +146,8 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxCombinedImageUnitsAndFragmentOutputs = ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs;
    this->Const.MaxImageSamples = ctx->Const.MaxImageSamples;
    this->Const.MaxVertexImageUniforms = ctx->Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms;
+   this->Const.MaxTessControlImageUniforms = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxImageUniforms;
+   this->Const.MaxTessEvaluationImageUniforms = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxImageUniforms;
    this->Const.MaxGeometryImageUniforms = ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms;
    this->Const.MaxFragmentImageUniforms = ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms;
    this->Const.MaxCombinedImageUniforms = ctx->Const.MaxCombinedImageUniforms;
@@ -147,12 +155,30 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    /* ARB_viewport_array */
    this->Const.MaxViewports = ctx->Const.MaxViewports;
 
+   /* tessellation shader constants */
+   this->Const.MaxPatchVertices = ctx->Const.MaxPatchVertices;
+   this->Const.MaxTessGenLevel = ctx->Const.MaxTessGenLevel;
+   this->Const.MaxTessControlInputComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxInputComponents;
+   this->Const.MaxTessControlOutputComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxOutputComponents;
+   this->Const.MaxTessControlTextureImageUnits = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits;
+   this->Const.MaxTessEvaluationInputComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxInputComponents;
+   this->Const.MaxTessEvaluationOutputComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxOutputComponents;
+   this->Const.MaxTessEvaluationTextureImageUnits = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits;
+   this->Const.MaxTessPatchComponents = ctx->Const.MaxTessPatchComponents;
+   this->Const.MaxTessControlTotalOutputComponents = ctx->Const.MaxTessControlTotalOutputComponents;
+   this->Const.MaxTessControlUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents;
+   this->Const.MaxTessEvaluationUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents;
+
    this->current_function = NULL;
    this->toplevel_ir = NULL;
    this->found_return = false;
    this->all_invariant = false;
    this->user_structures = NULL;
    this->num_user_structures = 0;
+   this->num_subroutines = 0;
+   this->subroutines = NULL;
+   this->num_subroutine_types = 0;
+   this->subroutine_types = NULL;
 
    /* supported_versions should be large enough to support the known desktop
     * GLSL versions plus 3 GLES versions (ES 1.00, ES 3.00, and ES 3.10))
@@ -226,6 +252,7 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->fs_redeclares_gl_fragcoord_with_no_layout_qualifiers = false;
 
    this->gs_input_prim_type_specified = false;
+   this->tcs_output_vertices_specified = false;
    this->gs_input_size = 0;
    this->in_qualifier = new(this) ast_type_qualifier();
    this->out_qualifier = new(this) ast_type_qualifier();
@@ -391,6 +418,8 @@ _mesa_shader_stage_to_string(unsigned stage)
    case MESA_SHADER_FRAGMENT: return "fragment";
    case MESA_SHADER_GEOMETRY: return "geometry";
    case MESA_SHADER_COMPUTE:  return "compute";
+   case MESA_SHADER_TESS_CTRL: return "tess ctrl";
+   case MESA_SHADER_TESS_EVAL: return "tess eval";
    }
 
    unreachable("Unknown shader stage.");
@@ -408,6 +437,8 @@ _mesa_shader_stage_to_abbrev(unsigned stage)
    case MESA_SHADER_FRAGMENT: return "FS";
    case MESA_SHADER_GEOMETRY: return "GS";
    case MESA_SHADER_COMPUTE:  return "CS";
+   case MESA_SHADER_TESS_CTRL: return "TCS";
+   case MESA_SHADER_TESS_EVAL: return "TES";
    }
 
    unreachable("Unknown shader stage.");
@@ -553,37 +584,40 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
 
    /* ARB extensions go here, sorted alphabetically.
     */
-   EXT(ARB_arrays_of_arrays,           true,  false,     ARB_arrays_of_arrays),
-   EXT(ARB_compute_shader,             true,  false,     ARB_compute_shader),
-   EXT(ARB_conservative_depth,         true,  false,     ARB_conservative_depth),
-   EXT(ARB_derivative_control,         true,  false,     ARB_derivative_control),
-   EXT(ARB_draw_buffers,               true,  false,     dummy_true),
-   EXT(ARB_draw_instanced,             true,  false,     ARB_draw_instanced),
-   EXT(ARB_explicit_attrib_location,   true,  false,     ARB_explicit_attrib_location),
-   EXT(ARB_explicit_uniform_location,  true,  false,     ARB_explicit_uniform_location),
-   EXT(ARB_fragment_coord_conventions, true,  false,     ARB_fragment_coord_conventions),
-   EXT(ARB_fragment_layer_viewport,    true,  false,     ARB_fragment_layer_viewport),
-   EXT(ARB_gpu_shader5,                true,  false,     ARB_gpu_shader5),
-   EXT(ARB_gpu_shader_fp64,            true,  false,     ARB_gpu_shader_fp64),
-   EXT(ARB_sample_shading,             true,  false,     ARB_sample_shading),
-   EXT(ARB_separate_shader_objects,    true,  false,     dummy_true),
-   EXT(ARB_shader_atomic_counters,     true,  false,     ARB_shader_atomic_counters),
-   EXT(ARB_shader_bit_encoding,        true,  false,     ARB_shader_bit_encoding),
-   EXT(ARB_shader_image_load_store,    true,  false,     ARB_shader_image_load_store),
-   EXT(ARB_shader_precision,           true,  false,     ARB_shader_precision),
-   EXT(ARB_shader_stencil_export,      true,  false,     ARB_shader_stencil_export),
-   EXT(ARB_shader_texture_lod,         true,  false,     ARB_shader_texture_lod),
-   EXT(ARB_shading_language_420pack,   true,  false,     ARB_shading_language_420pack),
-   EXT(ARB_shading_language_packing,   true,  false,     ARB_shading_language_packing),
-   EXT(ARB_texture_cube_map_array,     true,  false,     ARB_texture_cube_map_array),
-   EXT(ARB_texture_gather,             true,  false,     ARB_texture_gather),
-   EXT(ARB_texture_multisample,        true,  false,     ARB_texture_multisample),
-   EXT(ARB_texture_query_levels,       true,  false,     ARB_texture_query_levels),
-   EXT(ARB_texture_query_lod,          true,  false,     ARB_texture_query_lod),
-   EXT(ARB_texture_rectangle,          true,  false,     dummy_true),
-   EXT(ARB_uniform_buffer_object,      true,  false,     ARB_uniform_buffer_object),
-   EXT(ARB_vertex_attrib_64bit,        true,  false,     ARB_vertex_attrib_64bit),
-   EXT(ARB_viewport_array,             true,  false,     ARB_viewport_array),
+   EXT(ARB_arrays_of_arrays,             true,  false,     ARB_arrays_of_arrays),
+   EXT(ARB_compute_shader,               true,  false,     ARB_compute_shader),
+   EXT(ARB_conservative_depth,           true,  false,     ARB_conservative_depth),
+   EXT(ARB_derivative_control,           true,  false,     ARB_derivative_control),
+   EXT(ARB_draw_buffers,                 true,  false,     dummy_true),
+   EXT(ARB_draw_instanced,               true,  false,     ARB_draw_instanced),
+   EXT(ARB_explicit_attrib_location,     true,  false,     ARB_explicit_attrib_location),
+   EXT(ARB_explicit_uniform_location,    true,  false,     ARB_explicit_uniform_location),
+   EXT(ARB_fragment_coord_conventions,   true,  false,     ARB_fragment_coord_conventions),
+   EXT(ARB_fragment_layer_viewport,      true,  false,     ARB_fragment_layer_viewport),
+   EXT(ARB_gpu_shader5,                  true,  false,     ARB_gpu_shader5),
+   EXT(ARB_gpu_shader_fp64,              true,  false,     ARB_gpu_shader_fp64),
+   EXT(ARB_sample_shading,               true,  false,     ARB_sample_shading),
+   EXT(ARB_separate_shader_objects,      true,  false,     dummy_true),
+   EXT(ARB_shader_atomic_counters,       true,  false,     ARB_shader_atomic_counters),
+   EXT(ARB_shader_bit_encoding,          true,  false,     ARB_shader_bit_encoding),
+   EXT(ARB_shader_image_load_store,      true,  false,     ARB_shader_image_load_store),
+   EXT(ARB_shader_precision,             true,  false,     ARB_shader_precision),
+   EXT(ARB_shader_stencil_export,        true,  false,     ARB_shader_stencil_export),
+   EXT(ARB_shader_storage_buffer_object, true,  false,     ARB_shader_storage_buffer_object),
+   EXT(ARB_shader_subroutine,            true,  false,     ARB_shader_subroutine),
+   EXT(ARB_shader_texture_lod,           true,  false,     ARB_shader_texture_lod),
+   EXT(ARB_shading_language_420pack,     true,  false,     ARB_shading_language_420pack),
+   EXT(ARB_shading_language_packing,     true,  false,     ARB_shading_language_packing),
+   EXT(ARB_tessellation_shader,          true,  false,     ARB_tessellation_shader),
+   EXT(ARB_texture_cube_map_array,       true,  false,     ARB_texture_cube_map_array),
+   EXT(ARB_texture_gather,               true,  false,     ARB_texture_gather),
+   EXT(ARB_texture_multisample,          true,  false,     ARB_texture_multisample),
+   EXT(ARB_texture_query_levels,         true,  false,     ARB_texture_query_levels),
+   EXT(ARB_texture_query_lod,            true,  false,     ARB_texture_query_lod),
+   EXT(ARB_texture_rectangle,            true,  false,     dummy_true),
+   EXT(ARB_uniform_buffer_object,        true,  false,     ARB_uniform_buffer_object),
+   EXT(ARB_vertex_attrib_64bit,          true,  false,     ARB_vertex_attrib_64bit),
+   EXT(ARB_viewport_array,               true,  false,     ARB_viewport_array),
 
    /* KHR extensions go here, sorted alphabetically.
     */
@@ -827,6 +861,15 @@ _mesa_ast_set_aggregate_type(const glsl_type *type,
 void
 _mesa_ast_type_qualifier_print(const struct ast_type_qualifier *q)
 {
+   if (q->flags.q.subroutine)
+      printf("subroutine ");
+   /* A subroutine definition prints its parenthesized subroutine_list. */
+   if (q->flags.q.subroutine_def) {
+      printf("subroutine (");
+      q->subroutine_list->print();
+      printf(") "); /* trailing space, like the other qualifiers below */
+   }
+
    if (q->flags.q.constant)
       printf("const ");
 
@@ -853,8 +896,12 @@ _mesa_ast_type_qualifier_print(const struct ast_type_qualifier *q)
       printf("centroid ");
    if (q->flags.q.sample)
       printf("sample ");
+   if (q->flags.q.patch)
+      printf("patch ");
    if (q->flags.q.uniform)
       printf("uniform ");
+   if (q->flags.q.buffer)
+      printf("buffer ");
    if (q->flags.q.smooth)
       printf("smooth ");
    if (q->flags.q.flat)
@@ -1415,12 +1462,25 @@ ast_struct_specifier::ast_struct_specifier(const char *identifier,
    is_declaration = true;
 }
 
+void ast_subroutine_list::print(void) const
+{
+   /* Comma-separate the entries: every one except the head gets ", ". */
+   foreach_list_typed (ast_node, entry, link, & this->declarations) {
+      printf("%s", &entry->link != this->declarations.get_head() ? ", " : "");
+      entry->print();
+   }
+}
+
 static void
 set_shader_inout_layout(struct gl_shader *shader,
 		     struct _mesa_glsl_parse_state *state)
 {
-   if (shader->Stage != MESA_SHADER_GEOMETRY) {
-      /* Should have been prevented by the parser. */
+   /* Should have been prevented by the parser. */
+   if (shader->Stage == MESA_SHADER_TESS_CTRL) {
+      assert(!state->in_qualifier->flags.i);
+   } else if (shader->Stage == MESA_SHADER_TESS_EVAL) {
+      assert(!state->out_qualifier->flags.i);
+   } else if (shader->Stage != MESA_SHADER_GEOMETRY) {
       assert(!state->in_qualifier->flags.i);
       assert(!state->out_qualifier->flags.i);
    }
@@ -1440,6 +1500,28 @@ set_shader_inout_layout(struct gl_shader *shader,
    }
 
    switch (shader->Stage) {
+   case MESA_SHADER_TESS_CTRL:
+      shader->TessCtrl.VerticesOut = 0;
+      if (state->tcs_output_vertices_specified)
+         shader->TessCtrl.VerticesOut = state->out_qualifier->vertices;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      shader->TessEval.PrimitiveMode = PRIM_UNKNOWN;
+      if (state->in_qualifier->flags.q.prim_type)
+         shader->TessEval.PrimitiveMode = state->in_qualifier->prim_type;
+
+      shader->TessEval.Spacing = 0;
+      if (state->in_qualifier->flags.q.vertex_spacing)
+         shader->TessEval.Spacing = state->in_qualifier->vertex_spacing;
+
+      shader->TessEval.VertexOrder = 0;
+      if (state->in_qualifier->flags.q.ordering)
+         shader->TessEval.VertexOrder = state->in_qualifier->ordering;
+
+      shader->TessEval.PointMode = -1;
+      if (state->in_qualifier->flags.q.point_mode)
+         shader->TessEval.PointMode = state->in_qualifier->point_mode;
+      break;
    case MESA_SHADER_GEOMETRY:
       shader->Geom.VerticesOut = 0;
       if (state->out_qualifier->flags.q.max_vertices)
@@ -1537,6 +1619,7 @@ _mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader,
       struct gl_shader_compiler_options *options =
          &ctx->Const.ShaderCompilerOptions[shader->Stage];
 
+      lower_subroutine(shader->ir, state);
       /* Do some optimization at compile time to reduce shader IR size
        * and reduce later work if the same shader is linked multiple times
        */
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 9a0c24e6787..eb325f04eed 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -129,7 +129,7 @@ struct _mesa_glsl_parse_state {
    bool check_explicit_attrib_stream_allowed(YYLTYPE *locp)
    {
       if (!this->has_explicit_attrib_stream()) {
-         const char *const requirement = "GL_ARB_gpu_shader5 extension or GLSL 400";
+         const char *const requirement = "GL_ARB_gpu_shader5 extension or GLSL 4.00";
 
          _mesa_glsl_error(locp, this, "explicit stream requires %s",
                           requirement);
@@ -144,8 +144,8 @@ struct _mesa_glsl_parse_state {
    {
       if (!this->has_explicit_attrib_location()) {
          const char *const requirement = this->es_shader
-            ? "GLSL ES 300"
-            : "GL_ARB_explicit_attrib_location extension or GLSL 330";
+            ? "GLSL ES 3.00"
+            : "GL_ARB_explicit_attrib_location extension or GLSL 3.30";
 
          _mesa_glsl_error(locp, this, "%s explicit location requires %s",
                           mode_string(var), requirement);
@@ -160,8 +160,8 @@ struct _mesa_glsl_parse_state {
    {
       if (!this->has_separate_shader_objects()) {
          const char *const requirement = this->es_shader
-            ? "GL_EXT_separate_shader_objects extension or GLSL ES 310"
-            : "GL_ARB_separate_shader_objects extension or GLSL 420";
+            ? "GL_EXT_separate_shader_objects extension or GLSL ES 3.10"
+            : "GL_ARB_separate_shader_objects extension or GLSL 4.20";
 
          _mesa_glsl_error(locp, this, "%s explicit location requires %s",
                           mode_string(var), requirement);
@@ -177,9 +177,9 @@ struct _mesa_glsl_parse_state {
       if (!this->has_explicit_attrib_location() ||
           !this->has_explicit_uniform_location()) {
          const char *const requirement = this->es_shader
-            ? "GLSL ES 310"
+            ? "GLSL ES 3.10"
             : "GL_ARB_explicit_uniform_location and either "
-              "GL_ARB_explicit_attrib_location or GLSL 330.";
+              "GL_ARB_explicit_attrib_location or GLSL 3.30.";
 
          _mesa_glsl_error(locp, this,
                           "uniform explicit location requires %s",
@@ -215,6 +215,11 @@ struct _mesa_glsl_parse_state {
       return ARB_uniform_buffer_object_enable || is_version(140, 300);
    }
 
+   bool has_shader_storage_buffer_objects() const
+   {
+      return ARB_shader_storage_buffer_object_enable || is_version(430, 0);
+   }
+
    bool has_separate_shader_objects() const
    {
       return ARB_separate_shader_objects_enable || is_version(410, 310)
@@ -226,6 +231,11 @@ struct _mesa_glsl_parse_state {
       return ARB_gpu_shader_fp64_enable || is_version(400, 0);
    }
 
+   bool has_420pack() const
+   {
+      return ARB_shading_language_420pack_enable || is_version(420, 0);
+   }
+
    void process_version_directive(YYLTYPE *locp, int version,
                                   const char *ident);
 
@@ -272,15 +282,19 @@ struct _mesa_glsl_parse_state {
    bool fs_redeclares_gl_fragcoord_with_no_layout_qualifiers;
 
    /**
-    * True if a geometry shader input primitive type was specified using a
-    * layout directive.
+    * True if a geometry shader input primitive type or tessellation control
+    * output vertices were specified using a layout directive.
     *
-    * Note: this value is computed at ast_to_hir time rather than at parse
+    * Note: these values are computed at ast_to_hir time rather than at parse
     * time.
     */
    bool gs_input_prim_type_specified;
+   bool tcs_output_vertices_specified;
 
-   /** Input layout qualifiers from GLSL 1.50. (geometry shader controls)*/
+   /**
+    * Input layout qualifiers from GLSL 1.50 (geometry shader controls),
+    * and GLSL 4.00 (tessellation evaluation shader).
+    */
    struct ast_type_qualifier *in_qualifier;
 
    /**
@@ -298,7 +312,10 @@ struct _mesa_glsl_parse_state {
     */
    unsigned cs_input_local_size[3];
 
-   /** Output layout qualifiers from GLSL 1.50. (geometry shader controls)*/
+   /**
+    * Output layout qualifiers from GLSL 1.50 (geometry shader controls),
+    * and GLSL 4.00 (tessellation control shader).
+    */
    struct ast_type_qualifier *out_qualifier;
 
    /**
@@ -348,6 +365,8 @@ struct _mesa_glsl_parse_state {
 
       /* ARB_shader_atomic_counters */
       unsigned MaxVertexAtomicCounters;
+      unsigned MaxTessControlAtomicCounters;
+      unsigned MaxTessEvaluationAtomicCounters;
       unsigned MaxGeometryAtomicCounters;
       unsigned MaxFragmentAtomicCounters;
       unsigned MaxCombinedAtomicCounters;
@@ -358,6 +377,8 @@ struct _mesa_glsl_parse_state {
        * 3.10.
        */
       unsigned MaxVertexAtomicCounterBuffers;
+      unsigned MaxTessControlAtomicCounterBuffers;
+      unsigned MaxTessEvaluationAtomicCounterBuffers;
       unsigned MaxGeometryAtomicCounterBuffers;
       unsigned MaxFragmentAtomicCounterBuffers;
       unsigned MaxCombinedAtomicCounterBuffers;
@@ -372,12 +393,28 @@ struct _mesa_glsl_parse_state {
       unsigned MaxCombinedImageUnitsAndFragmentOutputs;
       unsigned MaxImageSamples;
       unsigned MaxVertexImageUniforms;
+      unsigned MaxTessControlImageUniforms;
+      unsigned MaxTessEvaluationImageUniforms;
       unsigned MaxGeometryImageUniforms;
       unsigned MaxFragmentImageUniforms;
       unsigned MaxCombinedImageUniforms;
 
       /* ARB_viewport_array */
       unsigned MaxViewports;
+
+      /* ARB_tessellation_shader */
+      unsigned MaxPatchVertices;
+      unsigned MaxTessGenLevel;
+      unsigned MaxTessControlInputComponents;
+      unsigned MaxTessControlOutputComponents;
+      unsigned MaxTessControlTextureImageUnits;
+      unsigned MaxTessEvaluationInputComponents;
+      unsigned MaxTessEvaluationOutputComponents;
+      unsigned MaxTessEvaluationTextureImageUnits;
+      unsigned MaxTessPatchComponents;
+      unsigned MaxTessControlTotalOutputComponents;
+      unsigned MaxTessControlUniformComponents;
+      unsigned MaxTessEvaluationUniformComponents;
    } Const;
 
    /**
@@ -462,12 +499,18 @@ struct _mesa_glsl_parse_state {
    bool ARB_shader_precision_warn;
    bool ARB_shader_stencil_export_enable;
    bool ARB_shader_stencil_export_warn;
+   bool ARB_shader_storage_buffer_object_enable;
+   bool ARB_shader_storage_buffer_object_warn;
+   bool ARB_shader_subroutine_enable;
+   bool ARB_shader_subroutine_warn;
    bool ARB_shader_texture_lod_enable;
    bool ARB_shader_texture_lod_warn;
    bool ARB_shading_language_420pack_enable;
    bool ARB_shading_language_420pack_warn;
    bool ARB_shading_language_packing_enable;
    bool ARB_shading_language_packing_warn;
+   bool ARB_tessellation_shader_enable;
+   bool ARB_tessellation_shader_warn;
    bool ARB_texture_cube_map_array_enable;
    bool ARB_texture_cube_map_array_warn;
    bool ARB_texture_gather_enable;
@@ -538,10 +581,38 @@ struct _mesa_glsl_parse_state {
 
    bool fs_early_fragment_tests;
 
+   /**
+    * For tessellation control shaders, size of the most recently seen output
+    * declaration that was a sized array, or 0 if no sized output array
+    * declarations have been seen.
+    *
+    * Unused for other shader types.
+    */
+   unsigned tcs_output_size;
+
    /** Atomic counter offsets by binding */
    unsigned atomic_counter_offsets[MAX_COMBINED_ATOMIC_BUFFERS];
 
    bool allow_extension_directive_midshader;
+
+   /**
+    * Known subroutine type declarations.
+    */
+   int num_subroutine_types;
+   ir_function **subroutine_types;
+
+   /**
+    * Functions that are associated with
+    * subroutine types.
+    */
+   int num_subroutines;
+   ir_function **subroutines;
+
+   /**
+    * field selection temporary parser storage -
+    * did the parser just parse a dot.
+    */
+   bool is_field;
 };
 
 # define YYLLOC_DEFAULT(Current, Rhs, N)			\
diff --git a/src/glsl/glsl_symbol_table.cpp b/src/glsl/glsl_symbol_table.cpp
index 2294dda42c8..536f0a3a8c2 100644
--- a/src/glsl/glsl_symbol_table.cpp
+++ b/src/glsl/glsl_symbol_table.cpp
@@ -36,6 +36,9 @@ public:
       case ir_var_uniform:
          dest = &ibu;
          break;
+      case ir_var_shader_storage:
+         dest = &iss;
+         break;
       case ir_var_shader_in:
          dest = &ibi;
          break;
@@ -60,6 +63,8 @@ public:
       switch (mode) {
       case ir_var_uniform:
          return ibu;
+      case ir_var_shader_storage:
+         return iss;
       case ir_var_shader_in:
          return ibi;
       case ir_var_shader_out:
@@ -71,24 +76,25 @@ public:
    }
 
    symbol_table_entry(ir_variable *v)               :
-      v(v), f(0), t(0), ibu(0), ibi(0), ibo(0), a(0) {}
+      v(v), f(0), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(0) {}
    symbol_table_entry(ir_function *f)               :
-      v(0), f(f), t(0), ibu(0), ibi(0), ibo(0), a(0) {}
+      v(0), f(f), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(0) {}
    symbol_table_entry(const glsl_type *t)           :
-      v(0), f(0), t(t), ibu(0), ibi(0), ibo(0), a(0) {}
+      v(0), f(0), t(t), ibu(0), iss(0), ibi(0), ibo(0), a(0) {}
    symbol_table_entry(const glsl_type *t, enum ir_variable_mode mode) :
-      v(0), f(0), t(0), ibu(0), ibi(0), ibo(0), a(0)
+      v(0), f(0), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(0)
    {
       assert(t->is_interface());
       add_interface(t, mode);
    }
    symbol_table_entry(const class ast_type_specifier *a):
-      v(0), f(0), t(0), ibu(0), ibi(0), ibo(0), a(a) {}
+      v(0), f(0), t(0), ibu(0), iss(0), ibi(0), ibo(0), a(a) {}
 
    ir_variable *v;
    ir_function *f;
    const glsl_type *t;
    const glsl_type *ibu;
+   const glsl_type *iss;
    const glsl_type *ibi;
    const glsl_type *ibo;
    const class ast_type_specifier *a;
diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index 37406b8073e..76814e894ed 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -25,7 +25,7 @@
 #include "main/core.h" /* for Elements, MAX2 */
 #include "glsl_parser_extras.h"
 #include "glsl_types.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
 
 
 mtx_t glsl_type::mutex = _MTX_INITIALIZER_NP;
@@ -33,6 +33,7 @@ hash_table *glsl_type::array_types = NULL;
 hash_table *glsl_type::record_types = NULL;
 hash_table *glsl_type::interface_types = NULL;
 hash_table *glsl_type::function_types = NULL;
+hash_table *glsl_type::subroutine_types = NULL;
 void *glsl_type::mem_ctx = NULL;
 
 void
@@ -123,6 +124,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].centroid = fields[i].centroid;
       this->fields.structure[i].sample = fields[i].sample;
       this->fields.structure[i].matrix_layout = fields[i].matrix_layout;
+      this->fields.structure[i].patch = fields[i].patch;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -155,6 +157,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].centroid = fields[i].centroid;
       this->fields.structure[i].sample = fields[i].sample;
       this->fields.structure[i].matrix_layout = fields[i].matrix_layout;
+      this->fields.structure[i].patch = fields[i].patch;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -193,6 +196,22 @@ glsl_type::glsl_type(const glsl_type *return_type,
    mtx_unlock(&glsl_type::mutex);
 }
 
+glsl_type::glsl_type(const char *subroutine_name) :
+   gl_type(0),
+   base_type(GLSL_TYPE_SUBROUTINE),
+   sampler_dimensionality(0), sampler_shadow(0), sampler_array(0),
+   sampler_type(0), interface_packing(0),
+   vector_elements(0), matrix_columns(0),
+   length(0)
+{
+   mtx_lock(&glsl_type::mutex);
+
+   init_ralloc_type_ctx();
+   assert(subroutine_name != NULL);
+   this->name = ralloc_strdup(this->mem_ctx, subroutine_name);
+   this->vector_elements = 1;
+   mtx_unlock(&glsl_type::mutex);
+}
 
 bool
 glsl_type::contains_sampler() const
@@ -263,6 +282,22 @@ glsl_type::contains_opaque() const {
    }
 }
 
+bool
+glsl_type::contains_subroutine() const
+{
+   if (this->is_array()) {
+      return this->fields.array->contains_subroutine();
+   } else if (this->is_record()) {
+      for (unsigned int i = 0; i < this->length; i++) {
+	 if (this->fields.structure[i].type->contains_subroutine())
+	    return true;
+      }
+      return false;
+   } else {
+      return this->is_subroutine();
+   }
+}
+
 gl_texture_index
 glsl_type::sampler_index() const
 {
@@ -358,19 +393,24 @@ const glsl_type *glsl_type::get_scalar_type() const
 void
 _mesa_glsl_release_types(void)
 {
-   mtx_lock(&glsl_type::mutex);
-
+   /* Should only be called during atexit (either when unloading shared
+    * object, or if process terminates), so no mutex-locking should be
+    * necessary.
+    */
    if (glsl_type::array_types != NULL) {
-      hash_table_dtor(glsl_type::array_types);
+      _mesa_hash_table_destroy(glsl_type::array_types, NULL);
       glsl_type::array_types = NULL;
    }
 
    if (glsl_type::record_types != NULL) {
-      hash_table_dtor(glsl_type::record_types);
+      _mesa_hash_table_destroy(glsl_type::record_types, NULL);
       glsl_type::record_types = NULL;
    }
 
-   mtx_unlock(&glsl_type::mutex);
+   if (glsl_type::interface_types != NULL) {
+      _mesa_hash_table_destroy(glsl_type::interface_types, NULL);
+      glsl_type::interface_types = NULL;
+   }
 }
 
 
@@ -682,27 +722,28 @@ glsl_type::get_array_instance(const glsl_type *base, unsigned array_size)
    mtx_lock(&glsl_type::mutex);
 
    if (array_types == NULL) {
-      array_types = hash_table_ctor(64, hash_table_string_hash,
-				    hash_table_string_compare);
+      array_types = _mesa_hash_table_create(NULL, _mesa_key_hash_string,
+                                            _mesa_key_string_equal);
    }
 
-   const glsl_type *t = (glsl_type *) hash_table_find(array_types, key);
-
-   if (t == NULL) {
+   const struct hash_entry *entry = _mesa_hash_table_search(array_types, key);
+   if (entry == NULL) {
       mtx_unlock(&glsl_type::mutex);
-      t = new glsl_type(base, array_size);
+      const glsl_type *t = new glsl_type(base, array_size);
       mtx_lock(&glsl_type::mutex);
 
-      hash_table_insert(array_types, (void *) t, ralloc_strdup(mem_ctx, key));
+      entry = _mesa_hash_table_insert(array_types,
+                                      ralloc_strdup(mem_ctx, key),
+                                      (void *) t);
    }
 
-   assert(t->base_type == GLSL_TYPE_ARRAY);
-   assert(t->length == array_size);
-   assert(t->fields.array == base);
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_ARRAY);
+   assert(((glsl_type *) entry->data)->length == array_size);
+   assert(((glsl_type *) entry->data)->fields.array == base);
 
    mtx_unlock(&glsl_type::mutex);
 
-   return t;
+   return (glsl_type *) entry->data;
 }
 
 
@@ -750,25 +791,22 @@ glsl_type::record_compare(const glsl_type *b) const
       if (this->fields.structure[i].sample
           != b->fields.structure[i].sample)
          return false;
+      if (this->fields.structure[i].patch
+          != b->fields.structure[i].patch)
+         return false;
    }
 
    return true;
 }
 
 
-int
+bool
 glsl_type::record_key_compare(const void *a, const void *b)
 {
    const glsl_type *const key1 = (glsl_type *) a;
    const glsl_type *const key2 = (glsl_type *) b;
 
-   /* Return zero is the types match (there is zero difference) or non-zero
-    * otherwise.
-    */
-   if (strcmp(key1->name, key2->name) != 0)
-      return 1;
-
-   return !key1->record_compare(key2);
+   return strcmp(key1->name, key2->name) == 0 && key1->record_compare(key2);
 }
 
 
@@ -806,25 +844,27 @@ glsl_type::get_record_instance(const glsl_struct_field *fields,
    mtx_lock(&glsl_type::mutex);
 
    if (record_types == NULL) {
-      record_types = hash_table_ctor(64, record_key_hash, record_key_compare);
+      record_types = _mesa_hash_table_create(NULL, record_key_hash,
+                                             record_key_compare);
    }
 
-   const glsl_type *t = (glsl_type *) hash_table_find(record_types, & key);
-   if (t == NULL) {
+   const struct hash_entry *entry = _mesa_hash_table_search(record_types,
+                                                            &key);
+   if (entry == NULL) {
       mtx_unlock(&glsl_type::mutex);
-      t = new glsl_type(fields, num_fields, name);
+      const glsl_type *t = new glsl_type(fields, num_fields, name);
       mtx_lock(&glsl_type::mutex);
 
-      hash_table_insert(record_types, (void *) t, t);
+      entry = _mesa_hash_table_insert(record_types, t, (void *) t);
    }
 
-   assert(t->base_type == GLSL_TYPE_STRUCT);
-   assert(t->length == num_fields);
-   assert(strcmp(t->name, name) == 0);
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_STRUCT);
+   assert(((glsl_type *) entry->data)->length == num_fields);
+   assert(strcmp(((glsl_type *) entry->data)->name, name) == 0);
 
    mtx_unlock(&glsl_type::mutex);
 
-   return t;
+   return (glsl_type *) entry->data;
 }
 
 
@@ -839,29 +879,62 @@ glsl_type::get_interface_instance(const glsl_struct_field *fields,
    mtx_lock(&glsl_type::mutex);
 
    if (interface_types == NULL) {
-      interface_types = hash_table_ctor(64, record_key_hash, record_key_compare);
+      interface_types = _mesa_hash_table_create(NULL, record_key_hash,
+                                                record_key_compare);
    }
 
-   const glsl_type *t = (glsl_type *) hash_table_find(interface_types, & key);
-   if (t == NULL) {
+   const struct hash_entry *entry = _mesa_hash_table_search(interface_types,
+                                                            &key);
+   if (entry == NULL) {
       mtx_unlock(&glsl_type::mutex);
-      t = new glsl_type(fields, num_fields, packing, block_name);
+      const glsl_type *t = new glsl_type(fields, num_fields,
+                                         packing, block_name);
       mtx_lock(&glsl_type::mutex);
 
-      hash_table_insert(interface_types, (void *) t, t);
+      entry = _mesa_hash_table_insert(interface_types, t, (void *) t);
    }
 
-   assert(t->base_type == GLSL_TYPE_INTERFACE);
-   assert(t->length == num_fields);
-   assert(strcmp(t->name, block_name) == 0);
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_INTERFACE);
+   assert(((glsl_type *) entry->data)->length == num_fields);
+   assert(strcmp(((glsl_type *) entry->data)->name, block_name) == 0);
 
    mtx_unlock(&glsl_type::mutex);
 
-   return t;
+   return (glsl_type *) entry->data;
+}
+
+const glsl_type *
+glsl_type::get_subroutine_instance(const char *subroutine_name)
+{
+   const glsl_type key(subroutine_name);
+
+   mtx_lock(&glsl_type::mutex);
+
+   if (subroutine_types == NULL) {
+      subroutine_types = _mesa_hash_table_create(NULL, record_key_hash,
+                                                 record_key_compare);
+   }
+
+   const struct hash_entry *entry = _mesa_hash_table_search(subroutine_types,
+                                                            &key);
+   if (entry == NULL) {
+      mtx_unlock(&glsl_type::mutex);
+      const glsl_type *t = new glsl_type(subroutine_name);
+      mtx_lock(&glsl_type::mutex);
+
+      entry = _mesa_hash_table_insert(subroutine_types, t, (void *) t);
+   }
+
+   assert(((glsl_type *) entry->data)->base_type == GLSL_TYPE_SUBROUTINE);
+   assert(strcmp(((glsl_type *) entry->data)->name, subroutine_name) == 0);
+
+   mtx_unlock(&glsl_type::mutex);
+
+   return (glsl_type *) entry->data;
 }
 
 
-static int
+static bool
 function_key_compare(const void *a, const void *b)
 {
    const glsl_type *const key1 = (glsl_type *) a;
@@ -875,7 +948,7 @@ function_key_compare(const void *a, const void *b)
 }
 
 
-static unsigned
+static uint32_t
 function_key_hash(const void *a)
 {
    const glsl_type *const key = (glsl_type *) a;
@@ -892,7 +965,7 @@ function_key_hash(const void *a)
 		       "%p", (void *) key->fields.structure[i].type);
    }
 
-   return hash_table_string_hash(& hash_key);
+   return _mesa_hash_string(hash_key);
 }
 
 const glsl_type *
@@ -905,19 +978,21 @@ glsl_type::get_function_instance(const glsl_type *return_type,
    mtx_lock(&glsl_type::mutex);
 
    if (function_types == NULL) {
-      function_types = hash_table_ctor(64, function_key_hash,
-                                       function_key_compare);
+      function_types = _mesa_hash_table_create(NULL, function_key_hash,
+                                               function_key_compare);
    }
 
-   const glsl_type *t = (glsl_type *) hash_table_find(function_types, &key);
-   if (t == NULL) {
+   struct hash_entry *entry = _mesa_hash_table_search(function_types, &key);
+   if (entry == NULL) {
       mtx_unlock(&glsl_type::mutex);
-      t = new glsl_type(return_type, params, num_params);
+      const glsl_type *t = new glsl_type(return_type, params, num_params);
       mtx_lock(&glsl_type::mutex);
 
-      hash_table_insert(function_types, (void *) t, t);
+      entry = _mesa_hash_table_insert(function_types, t, (void *) t);
    }
 
+   const glsl_type *t = (const glsl_type *)entry->data;
+
    assert(t->base_type == GLSL_TYPE_FUNCTION);
    assert(t->length == num_params);
 
@@ -1054,7 +1129,8 @@ glsl_type::component_slots() const
 
    case GLSL_TYPE_IMAGE:
       return 1;
-
+   case GLSL_TYPE_SUBROUTINE:
+     return 1;
    case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_ATOMIC_UINT:
@@ -1079,6 +1155,7 @@ glsl_type::uniform_locations() const
    case GLSL_TYPE_BOOL:
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_SUBROUTINE:
       return 1;
 
    case GLSL_TYPE_STRUCT:
@@ -1187,7 +1264,8 @@ glsl_type::std140_base_alignment(bool row_major) const
 	  this->fields.array->is_matrix()) {
 	 return MAX2(this->fields.array->std140_base_alignment(row_major), 16);
       } else {
-	 assert(this->fields.array->is_record());
+	 assert(this->fields.array->is_record() ||
+                this->fields.array->is_array());
 	 return this->fields.array->std140_base_alignment(row_major);
       }
    }
@@ -1432,6 +1510,7 @@ glsl_type::count_attribute_slots() const
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
+   case GLSL_TYPE_SUBROUTINE:
    case GLSL_TYPE_ERROR:
       break;
    }
diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index 836259a506c..28e2e93a305 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -60,6 +60,7 @@ enum glsl_base_type {
    GLSL_TYPE_INTERFACE,
    GLSL_TYPE_ARRAY,
    GLSL_TYPE_VOID,
+   GLSL_TYPE_SUBROUTINE,
    GLSL_TYPE_ERROR
 };
 
@@ -270,6 +271,10 @@ struct glsl_type {
    static const glsl_type *get_function_instance(const struct glsl_type *return_type,
                                                  const glsl_function_param *parameters,
                                                  unsigned num_params);
+   /**
+    * Get the instance of a subroutine type
+    */
+   static const glsl_type *get_subroutine_instance(const char *subroutine_name);
 
    /**
     * Get the type resulting from a multiplication of \p type_a * \p type_b
@@ -522,6 +527,19 @@ struct glsl_type {
-   /**
-    * Query if a type is unnamed/anonymous (named by the parser)
-    */
+   /**
+    * Query if a type is a subroutine type
+    */
+   bool is_subroutine() const
+   {
+      return base_type == GLSL_TYPE_SUBROUTINE;
+   }
+
+   /**
+    * Query if a type is a subroutine type, or an array or structure
+    * that contains one
+    */
+   bool contains_subroutine() const;
+
+   /**
+    * Query if a type is unnamed/anonymous (named by the parser)
+    */
    bool is_anonymous() const
    {
       return !strncmp(name, "#anon", 5);
@@ -691,6 +703,9 @@ private:
    /** Constructor for array types */
    glsl_type(const glsl_type *array, unsigned length);
 
+   /** Constructor for subroutine types */
+   glsl_type(const char *name);
+
    /** Hash table containing the known array types. */
    static struct hash_table *array_types;
 
@@ -703,7 +718,10 @@ private:
    /** Hash table containing the known function types. */
    static struct hash_table *function_types;
 
-   static int record_key_compare(const void *a, const void *b);
+   /** Hash table containing the known subroutine types. */
+   static struct hash_table *subroutine_types;
+
+   static bool record_key_compare(const void *a, const void *b);
    static unsigned record_key_hash(const void *key);
 
    /**
@@ -770,11 +788,32 @@ struct glsl_struct_field {
     */
    unsigned matrix_layout:2;
 
+   /**
+    * For interface blocks, 1 if this variable is a per-patch input or output
+    * (as in ir_variable::patch). 0 otherwise.
+    */
+   unsigned patch:1;
+
    /**
     * For interface blocks, it has a value if this variable uses multiple vertex
     * streams (as in ir_variable::stream). -1 otherwise.
     */
    int stream;
+
+#ifdef __cplusplus
+   glsl_struct_field(const struct glsl_type *_type, const char *_name)
+      : type(_type), name(_name), location(-1), interpolation(0), centroid(0),
+        sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
+        stream(-1)
+   {
+      /* empty */
+   }
+
+   glsl_struct_field()
+   {
+      /* empty */
+   }
+#endif
 };
 
 struct glsl_function_param {
diff --git a/src/glsl/hir_field_selection.cpp b/src/glsl/hir_field_selection.cpp
index 0fa976811e6..337095b95b8 100644
--- a/src/glsl/hir_field_selection.cpp
+++ b/src/glsl/hir_field_selection.cpp
@@ -56,45 +56,6 @@ _mesa_ast_field_selection_to_hir(const ast_expression *expr,
 			  "structure",
 			  expr->primary_expression.identifier);
       }
-   } else if (expr->subexpressions[1] != NULL) {
-      /* Handle "method calls" in GLSL 1.20 - namely, array.length() */
-      state->check_version(120, 300, &loc, "methods not supported");
-
-      ast_expression *call = expr->subexpressions[1];
-      assert(call->oper == ast_function_call);
-
-      const char *method;
-      method = call->subexpressions[0]->primary_expression.identifier;
-
-      if (strcmp(method, "length") == 0) {
-         if (!call->expressions.is_empty())
-            _mesa_glsl_error(&loc, state, "length method takes no arguments");
-
-         if (op->type->is_array()) {
-            if (op->type->is_unsized_array())
-               _mesa_glsl_error(&loc, state, "length called on unsized array");
-
-            result = new(ctx) ir_constant(op->type->array_size());
-         } else if (op->type->is_vector()) {
-            if (state->ARB_shading_language_420pack_enable) {
-               /* .length() returns int. */
-               result = new(ctx) ir_constant((int) op->type->vector_elements);
-            } else {
-               _mesa_glsl_error(&loc, state, "length method on matrix only available"
-                                             "with ARB_shading_language_420pack");
-            }
-         } else if (op->type->is_matrix()) {
-            if (state->ARB_shading_language_420pack_enable) {
-               /* .length() returns int. */
-               result = new(ctx) ir_constant((int) op->type->matrix_columns);
-            } else {
-               _mesa_glsl_error(&loc, state, "length method on matrix only available"
-                                             "with ARB_shading_language_420pack");
-            }
-         }
-      } else {
-	 _mesa_glsl_error(&loc, state, "unknown method: `%s'", method);
-      }
    } else if (op->type->is_vector() ||
               (state->ARB_shading_language_420pack_enable &&
                op->type->is_scalar())) {
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index dbd064feecc..724861b1e9f 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -63,8 +63,6 @@ update_rhs_swizzle(ir_swizzle_mask &m, unsigned from, unsigned to)
    case 3: m.w = from; break;
    default: assert(!"Should not get here.");
    }
-
-   m.num_components = MAX2(m.num_components, (to + 1));
 }
 
 void
@@ -95,6 +93,7 @@ ir_assignment::set_lhs(ir_rvalue *lhs)
 
 	 write_mask |= (((this->write_mask >> i) & 1) << c);
 	 update_rhs_swizzle(rhs_swiz, i, c);
+         rhs_swiz.num_components = swiz->val->type->vector_elements;
       }
 
       this->write_mask = write_mask;
@@ -114,6 +113,7 @@ ir_assignment::set_lhs(ir_rvalue *lhs)
 	 if (write_mask & (1 << i))
 	    update_rhs_swizzle(rhs_swiz, i, rhs_chan++);
       }
+      rhs_swiz.num_components = rhs_chan;
       this->rhs = new(mem_ctx) ir_swizzle(this->rhs, rhs_swiz);
    }
 
@@ -260,6 +260,7 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
    case ir_unop_bit_count:
    case ir_unop_find_msb:
    case ir_unop_find_lsb:
+   case ir_unop_subroutine_to_int:
       this->type = glsl_type::get_instance(GLSL_TYPE_INT,
 					   op0->type->vector_elements, 1);
       break;
@@ -568,6 +569,7 @@ static const char *const operator_strs[] = {
    "frexp_sig",
    "frexp_exp",
    "noise",
+   "subroutine_to_int",
    "interpolate_at_centroid",
    "+",
    "-",
@@ -1643,6 +1645,7 @@ ir_variable::ir_variable(const struct glsl_type *type, const char *name,
    this->data.read_only = false;
    this->data.centroid = false;
    this->data.sample = false;
+   this->data.patch = false;
    this->data.invariant = false;
    this->data.how_declared = ir_var_declared_normally;
    this->data.mode = mode;
@@ -1785,6 +1788,7 @@ ir_function_signature::qualifiers_match(exec_list *params)
 	  a->data.interpolation != b->data.interpolation ||
 	  a->data.centroid != b->data.centroid ||
           a->data.sample != b->data.sample ||
+          a->data.patch != b->data.patch ||
           a->data.image_read_only != b->data.image_read_only ||
           a->data.image_write_only != b->data.image_write_only ||
           a->data.image_coherent != b->data.image_coherent ||
@@ -1851,6 +1855,7 @@ static void
 steal_memory(ir_instruction *ir, void *new_ctx)
 {
    ir_variable *var = ir->as_variable();
+   ir_function *fn = ir->as_function();
    ir_constant *constant = ir->as_constant();
    if (var != NULL && var->constant_value != NULL)
       steal_memory(var->constant_value, ir);
@@ -1858,6 +1863,9 @@ steal_memory(ir_instruction *ir, void *new_ctx)
    if (var != NULL && var->constant_initializer != NULL)
       steal_memory(var->constant_initializer, ir);
 
+   if (fn != NULL && fn->subroutine_types)
+      ralloc_steal(new_ctx, fn->subroutine_types);
+
    /* The components of aggregate constants are not visited by the normal
     * visitor, so steal their values by hand.
     */
@@ -1975,6 +1983,9 @@ mode_string(const ir_variable *var)
    case ir_var_uniform:
       return "uniform";
 
+   case ir_var_shader_storage:
+      return "buffer";
+
    case ir_var_shader_in:
       return "shader input";
 
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 5af029b9765..7aac9af9001 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -324,6 +324,7 @@ protected:
 enum ir_variable_mode {
    ir_var_auto = 0,     /**< Function local variables and globals. */
    ir_var_uniform,      /**< Variable declared as a uniform. */
+   ir_var_shader_storage,   /**< Variable declared as an ssbo. */
    ir_var_shader_in,
    ir_var_shader_out,
    ir_var_function_in,
@@ -441,11 +442,14 @@ public:
    glsl_interp_qualifier determine_interpolation_mode(bool flat_shade);
 
    /**
-    * Determine whether or not a variable is part of a uniform block.
+    * Determine whether or not a variable is part of a uniform or
+    * shader storage block.
     */
-   inline bool is_in_uniform_block() const
+   inline bool is_in_buffer_block() const
    {
-      return this->data.mode == ir_var_uniform && this->interface_type != NULL;
+      return (this->data.mode == ir_var_uniform ||
+              this->data.mode == ir_var_shader_storage) &&
+             this->interface_type != NULL;
    }
 
    /**
@@ -618,6 +622,7 @@ public:
       unsigned read_only:1;
       unsigned centroid:1;
       unsigned sample:1;
+      unsigned patch:1;
       unsigned invariant:1;
       unsigned precise:1;
 
@@ -1133,6 +1138,21 @@ public:
     * List of ir_function_signature for each overloaded function with this name.
     */
    struct exec_list signatures;
+
+   /**
+    * is this function a subroutine type declaration
+    * e.g. subroutine void type1(float arg1);
+    */
+   bool is_subroutine;
+
+   /**
+    * is this function associated to a subroutine type
+    * e.g. subroutine (type1, type2) function_name { function_body };
+    * would have num_subroutine_types 2,
+    * and pointers to the type1 and type2 types.
+    */
+   int num_subroutine_types;
+   const struct glsl_type **subroutine_types;
 };
 
 inline const char *ir_function_signature::function_name() const
@@ -1392,6 +1412,7 @@ enum ir_expression_operation {
 
    ir_unop_noise,
 
+   ir_unop_subroutine_to_int,
    /**
     * Interpolate fs input at centroid
     *
@@ -1703,7 +1724,18 @@ public:
    ir_call(ir_function_signature *callee,
 	   ir_dereference_variable *return_deref,
 	   exec_list *actual_parameters)
-      : ir_instruction(ir_type_call), return_deref(return_deref), callee(callee)
+      : ir_instruction(ir_type_call), return_deref(return_deref), callee(callee), sub_var(NULL), array_idx(NULL)
+   {
+      assert(callee->return_type != NULL);
+      actual_parameters->move_nodes_to(& this->actual_parameters);
+      this->use_builtin = callee->is_builtin();
+   }
+
+   ir_call(ir_function_signature *callee,
+	   ir_dereference_variable *return_deref,
+	   exec_list *actual_parameters,
+	   ir_variable *var, ir_rvalue *array_idx)
+      : ir_instruction(ir_type_call), return_deref(return_deref), callee(callee), sub_var(var), array_idx(array_idx)
    {
       assert(callee->return_type != NULL);
       actual_parameters->move_nodes_to(& this->actual_parameters);
@@ -1751,6 +1783,14 @@ public:
 
    /** Should this call only bind to a built-in function? */
    bool use_builtin;
+
+   /*
+    * ARB_shader_subroutine support -
+    * the subroutine uniform variable and array index
+    * rvalue to be used in the lowering pass later.
+    */
+   ir_variable *sub_var;
+   ir_rvalue *array_idx;
 };
 
 
diff --git a/src/glsl/ir_builder.cpp b/src/glsl/ir_builder.cpp
index e44b05c991c..cd03859cac0 100644
--- a/src/glsl/ir_builder.cpp
+++ b/src/glsl/ir_builder.cpp
@@ -338,6 +338,12 @@ sign(operand a)
    return expr(ir_unop_sign, a);
 }
 
+ir_expression *
+subr_to_int(operand a)
+{
+   return expr(ir_unop_subroutine_to_int, a);
+}
+
 ir_expression*
 equal(operand a, operand b)
 {
diff --git a/src/glsl/ir_builder.h b/src/glsl/ir_builder.h
index 87026588113..f76453ffcf0 100644
--- a/src/glsl/ir_builder.h
+++ b/src/glsl/ir_builder.h
@@ -153,6 +153,7 @@ ir_expression *sqrt(operand a);
 ir_expression *log(operand a);
 ir_expression *sign(operand a);
 
+ir_expression *subr_to_int(operand a);
 ir_expression *equal(operand a, operand b);
 ir_expression *nequal(operand a, operand b);
 ir_expression *less(operand a, operand b);
diff --git a/src/glsl/ir_clone.cpp b/src/glsl/ir_clone.cpp
index 636c143ddc2..4edf70dba5d 100644
--- a/src/glsl/ir_clone.cpp
+++ b/src/glsl/ir_clone.cpp
@@ -267,6 +267,12 @@ ir_function::clone(void *mem_ctx, struct hash_table *ht) const
 {
    ir_function *copy = new(mem_ctx) ir_function(this->name);
 
+   copy->is_subroutine = this->is_subroutine;
+   copy->num_subroutine_types = this->num_subroutine_types;
+   copy->subroutine_types = ralloc_array(mem_ctx, const struct glsl_type *, copy->num_subroutine_types);
+   for (int i = 0; i < copy->num_subroutine_types; i++)
+     copy->subroutine_types[i] = this->subroutine_types[i];
+
    foreach_in_list(const ir_function_signature, sig, &this->signatures) {
       ir_function_signature *sig_copy = sig->clone(mem_ctx, ht);
       copy->add_signature(sig_copy);
@@ -363,6 +369,7 @@ ir_constant::clone(void *mem_ctx, struct hash_table *ht) const
    case GLSL_TYPE_ATOMIC_UINT:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_SUBROUTINE:
    case GLSL_TYPE_INTERFACE:
       assert(!"Should not get here.");
       break;
diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 171b8e95444..309b6b72b5b 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -40,12 +40,7 @@
 #include "glsl_types.h"
 #include "program/hash_table.h"
 
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-static int isnormal(double x)
-{
-   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
-}
-#elif defined(__SUNPRO_CC) && !defined(isnormal)
+#if defined(__SUNPRO_CC) && !defined(isnormal)
 #include <ieeefp.h>
 static int isnormal(double x)
 {
@@ -53,13 +48,6 @@ static int isnormal(double x)
 }
 #endif
 
-#if defined(_MSC_VER)
-static double copysign(double x, double y)
-{
-   return _copysign(x, y);
-}
-#endif
-
 static float
 dot_f(ir_constant *op0, ir_constant *op1)
 {
@@ -242,12 +230,9 @@ pack_snorm_1x8(float x)
      *    follows:
      *
      *      packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
-     *
-     * We must first cast the float to an int, because casting a negative
-     * float to a uint is undefined.
      */
-   return (uint8_t) (int)
-          _mesa_roundevenf(CLAMP(x, -1.0f, +1.0f) * 127.0f);
+   return (uint8_t)
+          _mesa_lroundevenf(CLAMP(x, -1.0f, +1.0f) * 127.0f);
 }
 
 /**
@@ -264,12 +249,9 @@ pack_snorm_1x16(float x)
      *    follows:
      *
      *      packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
-     *
-     * We must first cast the float to an int, because casting a negative
-     * float to a uint is undefined.
      */
-   return (uint16_t) (int)
-          _mesa_roundevenf(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
+   return (uint16_t)
+          _mesa_lroundevenf(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
 }
 
 /**
@@ -1674,10 +1656,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
             if (!isnormal(data.d[c]))
                data.d[c] = copysign(0.0, op[0]->value.d[c]);
          } else {
-            data.f[c] = ldexp(op[0]->value.f[c], op[1]->value.i[c]);
+            data.f[c] = ldexpf(op[0]->value.f[c], op[1]->value.i[c]);
             /* Flush subnormal values to zero. */
             if (!isnormal(data.f[c]))
-               data.f[c] = copysign(0.0f, op[0]->value.f[c]);
+               data.f[c] = copysignf(0.0f, op[0]->value.f[c]);
          }
       }
       break;
diff --git a/src/glsl/ir_function.cpp b/src/glsl/ir_function.cpp
index 13194439003..93034bedb5a 100644
--- a/src/glsl/ir_function.cpp
+++ b/src/glsl/ir_function.cpp
@@ -72,6 +72,7 @@ parameter_lists_match(_mesa_glsl_parse_state *state,
       switch ((enum ir_variable_mode)(param->data.mode)) {
       case ir_var_auto:
       case ir_var_uniform:
+      case ir_var_shader_storage:
       case ir_var_temporary:
 	 /* These are all error conditions.  It is invalid for a parameter to
 	  * a function to be declared as auto (not in, out, or inout) or
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index e6939f3fe1f..eef107e5249 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -114,12 +114,13 @@ bool lower_discard(exec_list *instructions);
 void lower_discard_flow(exec_list *instructions);
 bool lower_instructions(exec_list *instructions, unsigned what_to_lower);
 bool lower_noise(exec_list *instructions);
-bool lower_variable_index_to_cond_assign(exec_list *instructions,
-    bool lower_input, bool lower_output, bool lower_temp, bool lower_uniform);
+bool lower_variable_index_to_cond_assign(gl_shader_stage stage,
+    exec_list *instructions, bool lower_input, bool lower_output,
+    bool lower_temp, bool lower_uniform);
 bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
 bool lower_const_arrays_to_uniforms(exec_list *instructions);
 bool lower_clip_distance(gl_shader *shader);
-void lower_output_reads(exec_list *instructions);
+void lower_output_reads(unsigned stage, exec_list *instructions);
 bool lower_packing_builtins(exec_list *instructions, int op_mask);
 void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
 void lower_packed_varyings(void *mem_ctx,
@@ -132,9 +133,12 @@ bool optimize_split_arrays(exec_list *instructions, bool linked);
 bool lower_offset_arrays(exec_list *instructions);
 void optimize_dead_builtin_variables(exec_list *instructions,
                                      enum ir_variable_mode other);
+bool lower_tess_level(gl_shader *shader);
 
 bool lower_vertex_id(gl_shader *shader);
 
+bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state);
+
 ir_rvalue *
 compare_index_block(exec_list *instructions, ir_variable *index,
 		    unsigned base, unsigned components, void *mem_ctx);
diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index 4cbcad4ec61..8dbd938c58b 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -25,7 +25,7 @@
 #include "glsl_types.h"
 #include "glsl_parser_extras.h"
 #include "main/macros.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
 
 static void print_type(FILE *f, const glsl_type *t);
 
@@ -89,14 +89,14 @@ ir_print_visitor::ir_print_visitor(FILE *f)
 {
    indentation = 0;
    printable_names =
-      hash_table_ctor(32, hash_table_pointer_hash, hash_table_pointer_compare);
+      _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
    symbols = _mesa_symbol_table_ctor();
    mem_ctx = ralloc_context(NULL);
 }
 
 ir_print_visitor::~ir_print_visitor()
 {
-   hash_table_dtor(printable_names);
+   _mesa_hash_table_destroy(printable_names, NULL);
    _mesa_symbol_table_dtor(symbols);
    ralloc_free(mem_ctx);
 }
@@ -121,18 +121,22 @@ ir_print_visitor::unique_name(ir_variable *var)
    }
 
    /* Do we already have a name for this variable? */
-   const char *name = (const char *) hash_table_find(this->printable_names, var);
-   if (name != NULL)
-      return name;
+   struct hash_entry * entry =
+      _mesa_hash_table_search(this->printable_names, var);
+
+   if (entry != NULL) {
+      return (const char *) entry->data;
+   }
 
    /* If there's no conflict, just use the original name */
+   const char* name = NULL;
    if (_mesa_symbol_table_find_symbol(this->symbols, -1, var->name) == NULL) {
       name = var->name;
    } else {
       static unsigned i = 1;
       name = ralloc_asprintf(this->mem_ctx, "%s@%u", var->name, ++i);
    }
-   hash_table_insert(this->printable_names, (void *) name, var);
+   _mesa_hash_table_insert(this->printable_names, var, (void *) name);
    _mesa_symbol_table_add_symbol(this->symbols, -1, name, var);
    return name;
 }
@@ -167,8 +171,10 @@ void ir_print_visitor::visit(ir_variable *ir)
 
    const char *const cent = (ir->data.centroid) ? "centroid " : "";
    const char *const samp = (ir->data.sample) ? "sample " : "";
+   const char *const patc = (ir->data.patch) ? "patch " : "";
    const char *const inv = (ir->data.invariant) ? "invariant " : "";
-   const char *const mode[] = { "", "uniform ", "shader_in ", "shader_out ",
+   const char *const mode[] = { "", "uniform ", "shader_storage ",
+                                "shader_in ", "shader_out ",
                                 "in ", "out ", "inout ",
 			        "const_in ", "sys ", "temporary " };
    STATIC_ASSERT(ARRAY_SIZE(mode) == ir_var_mode_count);
@@ -176,8 +182,8 @@ void ir_print_visitor::visit(ir_variable *ir)
    const char *const interp[] = { "", "smooth", "flat", "noperspective" };
    STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_QUALIFIER_COUNT);
 
-   fprintf(f, "(%s%s%s%s%s%s%s) ",
-           loc, cent, samp, inv, mode[ir->data.mode],
+   fprintf(f, "(%s%s%s%s%s%s%s%s) ",
+           loc, cent, samp, patc, inv, mode[ir->data.mode],
            stream[ir->data.stream],
            interp[ir->data.interpolation]);
 
@@ -229,7 +235,7 @@ void ir_print_visitor::visit(ir_function_signature *ir)
 
 void ir_print_visitor::visit(ir_function *ir)
 {
-   fprintf(f, "(function %s\n", ir->name);
+   fprintf(f, "(%s function %s\n", ir->is_subroutine ? "subroutine" : "", ir->name);
    indentation++;
    foreach_in_list(ir_function_signature, sig, &ir->signatures) {
       indent();
diff --git a/src/glsl/ir_reader.cpp b/src/glsl/ir_reader.cpp
index 4eae4131c57..469837f5e4c 100644
--- a/src/glsl/ir_reader.cpp
+++ b/src/glsl/ir_reader.cpp
@@ -417,10 +417,14 @@ ir_reader::read_declaration(s_expression *expr)
 	 var->data.centroid = 1;
       } else if (strcmp(qualifier->value(), "sample") == 0) {
          var->data.sample = 1;
+      } else if (strcmp(qualifier->value(), "patch") == 0) {
+         var->data.patch = 1;
       } else if (strcmp(qualifier->value(), "invariant") == 0) {
 	 var->data.invariant = 1;
       } else if (strcmp(qualifier->value(), "uniform") == 0) {
 	 var->data.mode = ir_var_uniform;
+      } else if (strcmp(qualifier->value(), "shader_storage") == 0) {
+	 var->data.mode = ir_var_shader_storage;
       } else if (strcmp(qualifier->value(), "auto") == 0) {
 	 var->data.mode = ir_var_auto;
       } else if (strcmp(qualifier->value(), "in") == 0) {
diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp
index b968a1efd3e..b7a0f6e95ba 100644
--- a/src/glsl/ir_set_program_inouts.cpp
+++ b/src/glsl/ir_set_program_inouts.cpp
@@ -103,10 +103,26 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
    for (int i = 0; i < len; i++) {
       bool dual_slot = is_dual_slot(var);
       int idx = var->data.location + var->data.index + offset + i;
-      GLbitfield64 bitfield = BITFIELD64_BIT(idx);
+      bool is_patch_generic = var->data.patch &&
+                              idx != VARYING_SLOT_TESS_LEVEL_INNER &&
+                              idx != VARYING_SLOT_TESS_LEVEL_OUTER;
+      GLbitfield64 bitfield;
+
+      if (is_patch_generic) {
+         assert(idx >= VARYING_SLOT_PATCH0 && idx < VARYING_SLOT_TESS_MAX);
+         bitfield = BITFIELD64_BIT(idx - VARYING_SLOT_PATCH0);
+      }
+      else {
+         assert(idx < VARYING_SLOT_MAX);
+         bitfield = BITFIELD64_BIT(idx);
+      }
 
       if (var->data.mode == ir_var_shader_in) {
-         prog->InputsRead |= bitfield;
+         if (is_patch_generic)
+            prog->PatchInputsRead |= bitfield;
+         else
+            prog->InputsRead |= bitfield;
+
          if (dual_slot)
             prog->DoubleInputsRead |= bitfield;
          if (is_fragment_shader) {
@@ -122,7 +138,10 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
          prog->SystemValuesRead |= bitfield;
       } else {
          assert(var->data.mode == ir_var_shader_out);
-	 prog->OutputsWritten |= bitfield;
+         if (is_patch_generic)
+            prog->PatchOutputsWritten |= bitfield;
+         else
+            prog->OutputsWritten |= bitfield;
       }
    }
 }
@@ -140,6 +159,24 @@ ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
       type = type->fields.array;
    }
 
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_in) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_out && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_EVAL &&
+       var->data.mode == ir_var_shader_in && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
    mark(this->prog, var, 0, type->count_attribute_slots(),
         this->shader_stage == MESA_SHADER_FRAGMENT);
 }
@@ -165,6 +202,9 @@ ir_set_program_inouts_visitor::visit(ir_dereference_variable *ir)
  *
  * *Except gl_PrimitiveIDIn, as noted below.
  *
+ * For tessellation control shaders all inputs and non-patch outputs are
+ * arrays. For tessellation evaluation shaders non-patch inputs are arrays.
+ *
  * If the index can't be interpreted as a constant, or some other problem
  * occurs, then nothing will be marked and false will be returned.
  */
@@ -184,6 +224,24 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
       type = type->fields.array;
    }
 
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_in) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_CTRL &&
+       var->data.mode == ir_var_shader_out && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
+   if (this->shader_stage == MESA_SHADER_TESS_EVAL &&
+       var->data.mode == ir_var_shader_in && !var->data.patch) {
+      assert(type->is_array());
+      type = type->fields.array;
+   }
+
    /* The code below only handles:
     *
     * - Indexing into matrices
@@ -242,6 +300,22 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
    return true;
 }
 
+static bool
+is_multiple_vertices(gl_shader_stage stage, ir_variable *var)
+{
+   if (var->data.patch)
+      return false;
+
+   if (var->data.mode == ir_var_shader_in)
+      return stage == MESA_SHADER_GEOMETRY ||
+             stage == MESA_SHADER_TESS_CTRL ||
+             stage == MESA_SHADER_TESS_EVAL;
+   if (var->data.mode == ir_var_shader_out)
+      return stage == MESA_SHADER_TESS_CTRL;
+
+   return false;
+}
+
 ir_visitor_status
 ir_set_program_inouts_visitor::visit_enter(ir_dereference_array *ir)
 {
@@ -256,10 +330,9 @@ ir_set_program_inouts_visitor::visit_enter(ir_dereference_array *ir)
        */
       if (ir_dereference_variable * const deref_var =
           inner_array->array->as_dereference_variable()) {
-         if (this->shader_stage == MESA_SHADER_GEOMETRY &&
-             deref_var->var->data.mode == ir_var_shader_in) {
-            /* foo is a geometry shader input, so i is the vertex, and j the
-             * part of the input we're accessing.
+         if (is_multiple_vertices(this->shader_stage, deref_var->var)) {
+            /* foo is a geometry or tessellation shader input, so i is
+             * the vertex, and j the part of the input we're accessing.
              */
             if (try_mark_partial_variable(deref_var->var, ir->array_index))
             {
@@ -275,10 +348,9 @@ ir_set_program_inouts_visitor::visit_enter(ir_dereference_array *ir)
    } else if (ir_dereference_variable * const deref_var =
               ir->array->as_dereference_variable()) {
       /* ir => foo[i], where foo is a variable. */
-      if (this->shader_stage == MESA_SHADER_GEOMETRY &&
-          deref_var->var->data.mode == ir_var_shader_in) {
-         /* foo is a geometry shader input, so i is the vertex, and we're
-          * accessing the entire input.
+      if (is_multiple_vertices(this->shader_stage, deref_var->var)) {
+         /* foo is a geometry or tessellation shader input, so i is
+          * the vertex, and we're accessing the entire input.
           */
          mark_whole_variable(deref_var->var);
          /* We've now taken care of foo, but i might contain a subexpression
@@ -353,6 +425,8 @@ do_set_program_inouts(exec_list *instructions, struct gl_program *prog,
 
    prog->InputsRead = 0;
    prog->OutputsWritten = 0;
+   prog->PatchInputsRead = 0;
+   prog->PatchOutputsWritten = 0;
    prog->SystemValuesRead = 0;
    if (shader_stage == MESA_SHADER_FRAGMENT) {
       gl_fragment_program *fprog = (gl_fragment_program *) prog;
diff --git a/src/glsl/ir_uniform.h b/src/glsl/ir_uniform.h
index e1b80147788..0b6f7201a20 100644
--- a/src/glsl/ir_uniform.h
+++ b/src/glsl/ir_uniform.h
@@ -114,6 +114,8 @@ struct gl_uniform_storage {
 
    struct gl_opaque_uniform_index image[MESA_SHADER_STAGES];
 
+   struct gl_opaque_uniform_index subroutine[MESA_SHADER_STAGES];
+
    /**
     * Storage used by the driver for the uniform
     */
@@ -173,9 +175,15 @@ struct gl_uniform_storage {
    /**
     * The 'base location' for this uniform in the uniform remap table. For
     * arrays this is the first element in the array.
+    * For subroutines this indexes the shader subroutine uniform remap table.
     */
    unsigned remap_location;
 
+   /**
+    * The number of compatible subroutines with this subroutine uniform.
+    */
+   unsigned num_compatible_subroutines;
+
    /**
     * This is a compiler-generated uniform that should not be advertised
     * via the API.
diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
index cfe0df3dca6..3f0dea74e27 100644
--- a/src/glsl/ir_validate.cpp
+++ b/src/glsl/ir_validate.cpp
@@ -35,7 +35,8 @@
 
 #include "ir.h"
 #include "ir_hierarchical_visitor.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
+#include "util/set.h"
 #include "glsl_types.h"
 
 namespace {
@@ -44,18 +45,18 @@ class ir_validate : public ir_hierarchical_visitor {
 public:
    ir_validate()
    {
-      this->ht = hash_table_ctor(0, hash_table_pointer_hash,
-				 hash_table_pointer_compare);
+      this->ir_set = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                      _mesa_key_pointer_equal);
 
       this->current_function = NULL;
 
       this->callback_enter = ir_validate::validate_ir;
-      this->data_enter = ht;
+      this->data_enter = ir_set;
    }
 
    ~ir_validate()
    {
-      hash_table_dtor(this->ht);
+      _mesa_set_destroy(this->ir_set, NULL);
    }
 
    virtual ir_visitor_status visit(ir_variable *v);
@@ -80,7 +81,7 @@ public:
 
    ir_function *current_function;
 
-   struct hash_table *ht;
+   struct set *ir_set;
 };
 
 } /* anonymous namespace */
@@ -94,7 +95,7 @@ ir_validate::visit(ir_dereference_variable *ir)
       abort();
    }
 
-   if (hash_table_find(ht, ir->var) == NULL) {
+   if (_mesa_set_search(ir_set, ir->var) == NULL) {
       printf("ir_dereference_variable @ %p specifies undeclared variable "
 	     "`%s' @ %p\n",
 	     (void *) ir, ir->var->name, (void *) ir->var);
@@ -447,6 +448,10 @@ ir_validate::visit_leave(ir_expression *ir)
              ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE);
       assert(ir->type->base_type == GLSL_TYPE_INT);
       break;
+   case ir_unop_subroutine_to_int:
+      assert(ir->operands[0]->type->base_type == GLSL_TYPE_SUBROUTINE);
+      assert(ir->type->base_type == GLSL_TYPE_INT);
+      break;
    case ir_binop_add:
    case ir_binop_sub:
    case ir_binop_mul:
@@ -730,8 +735,7 @@ ir_validate::visit(ir_variable *ir)
    if (ir->name && ir->is_name_ralloced())
       assert(ralloc_parent(ir->name) == ir);
 
-   hash_table_insert(ht, ir, ir);
-
+   _mesa_set_add(ir_set, ir);
 
    /* If a variable is an array, verify that the maximum array index is in
     * bounds.  There was once an error in AST-to-HIR conversion that set this
@@ -885,15 +889,15 @@ dump_ir:
 void
 ir_validate::validate_ir(ir_instruction *ir, void *data)
 {
-   struct hash_table *ht = (struct hash_table *) data;
+   struct set *ir_set = (struct set *) data;
 
-   if (hash_table_find(ht, ir)) {
+   if (_mesa_set_search(ir_set, ir)) {
       printf("Instruction node present twice in ir tree:\n");
       ir->print();
       printf("\n");
       abort();
    }
-   hash_table_insert(ht, ir, ir);
+   _mesa_set_add(ir_set, ir);
 }
 
 void
diff --git a/src/glsl/link_interface_blocks.cpp b/src/glsl/link_interface_blocks.cpp
index 07f5b4223a8..936e2e0ba21 100644
--- a/src/glsl/link_interface_blocks.cpp
+++ b/src/glsl/link_interface_blocks.cpp
@@ -112,7 +112,8 @@ intrastage_match(interface_block_definition *a,
     * it's not clear from the spec whether they need to match, but
     * Mesa's implementation relies on them matching.
     */
-   if (a->instance_name != NULL && mode != ir_var_uniform &&
+   if (a->instance_name != NULL &&
+       mode != ir_var_uniform && mode != ir_var_shader_storage &&
        strcmp(a->instance_name, b->instance_name) != 0) {
       return false;
    }
@@ -133,9 +134,9 @@ intrastage_match(interface_block_definition *a,
  * Check if two interfaces match, according to interstage (in/out) interface
  * matching rules.
  *
- * If \c extra_array_level is true, then vertex-to-geometry shader matching
- * rules are enforced (i.e. a successful match requires the consumer interface
- * to be an array and the producer interface to be a non-array).
+ * If \c extra_array_level is true, the consumer interface is required to be
+ * an array and the producer interface is required to be a non-array.
+ * This is used for tessellation control and geometry shader consumers.
  */
 bool
 interstage_match(const interface_block_definition *producer,
@@ -253,6 +254,7 @@ validate_intrastage_interface_blocks(struct gl_shader_program *prog,
    interface_block_definitions in_interfaces;
    interface_block_definitions out_interfaces;
    interface_block_definitions uniform_interfaces;
+   interface_block_definitions buffer_interfaces;
 
    for (unsigned int i = 0; i < num_shaders; i++) {
       if (shader_list[i] == NULL)
@@ -279,6 +281,9 @@ validate_intrastage_interface_blocks(struct gl_shader_program *prog,
          case ir_var_uniform:
             definitions = &uniform_interfaces;
             break;
+         case ir_var_shader_storage:
+            definitions = &buffer_interfaces;
+            break;
          default:
             /* Only in, out, and uniform interfaces are legal, so we should
              * never get here.
@@ -313,7 +318,10 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog,
                                  const gl_shader *consumer)
 {
    interface_block_definitions definitions;
-   const bool extra_array_level = consumer->Stage == MESA_SHADER_GEOMETRY;
+   /* VS -> GS, VS -> TCS, VS -> TES, TES -> GS */
+   const bool extra_array_level = (producer->Stage == MESA_SHADER_VERTEX &&
+                                   consumer->Stage != MESA_SHADER_FRAGMENT) ||
+                                  consumer->Stage == MESA_SHADER_GEOMETRY;
 
    /* Add input interfaces from the consumer to the symbol table. */
    foreach_in_list(ir_instruction, node, consumer->ir) {
@@ -361,7 +369,9 @@ validate_interstage_uniform_blocks(struct gl_shader_program *prog,
       const gl_shader *stage = stages[i];
       foreach_in_list(ir_instruction, node, stage->ir) {
          ir_variable *var = node->as_variable();
-         if (!var || !var->get_interface_type() || var->data.mode != ir_var_uniform)
+         if (!var || !var->get_interface_type() ||
+             (var->data.mode != ir_var_uniform &&
+              var->data.mode != ir_var_shader_storage))
             continue;
 
          interface_block_definition *old_def =
@@ -374,7 +384,9 @@ validate_interstage_uniform_blocks(struct gl_shader_program *prog,
              * uniform matchin rules (for uniforms, it is as though all
              * shaders are in the same shader stage).
              */
-            if (!intrastage_match(old_def, &new_def, ir_var_uniform, prog)) {
+            if (!intrastage_match(old_def, &new_def,
+                                  (ir_variable_mode) var->data.mode,
+                                  prog)) {
                linker_error(prog, "definitions of interface block `%s' do not "
                             "match\n", var->get_interface_type()->name);
                return;
diff --git a/src/glsl/link_uniform_block_active_visitor.cpp b/src/glsl/link_uniform_block_active_visitor.cpp
index 701ca979b7f..981c1f75571 100644
--- a/src/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/glsl/link_uniform_block_active_visitor.cpp
@@ -44,6 +44,7 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
 
       b->type = block_type;
       b->has_instance_name = var->is_interface_instance();
+      b->is_shader_storage = var->data.mode == ir_var_shader_storage;
 
       if (var->data.explicit_binding) {
          b->has_binding = true;
@@ -78,7 +79,7 @@ process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
 ir_visitor_status
 link_uniform_block_active_visitor::visit(ir_variable *var)
 {
-   if (!var->is_in_uniform_block())
+   if (!var->is_in_buffer_block())
       return visit_continue;
 
    const glsl_type *const block_type = var->is_interface_instance()
@@ -129,7 +130,7 @@ link_uniform_block_active_visitor::visit_enter(ir_dereference_array *ir)
     * function.
     */
    if (var == NULL
-       || !var->is_in_uniform_block()
+       || !var->is_in_buffer_block()
        || !var->is_interface_instance())
       return visit_continue;
 
@@ -199,7 +200,7 @@ link_uniform_block_active_visitor::visit(ir_dereference_variable *ir)
 {
    ir_variable *var = ir->var;
 
-   if (!var->is_in_uniform_block())
+   if (!var->is_in_buffer_block())
       return visit_continue;
 
    assert(!var->is_interface_instance() || !var->type->is_array());
diff --git a/src/glsl/link_uniform_block_active_visitor.h b/src/glsl/link_uniform_block_active_visitor.h
index 148a3915abd..d8aefd69991 100644
--- a/src/glsl/link_uniform_block_active_visitor.h
+++ b/src/glsl/link_uniform_block_active_visitor.h
@@ -40,6 +40,7 @@ struct link_uniform_block_active {
 
    bool has_instance_name;
    bool has_binding;
+   bool is_shader_storage;
 };
 
 class link_uniform_block_active_visitor : public ir_hierarchical_visitor {
diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp
index c0d73076aa8..b80e5736f6b 100644
--- a/src/glsl/link_uniform_blocks.cpp
+++ b/src/glsl/link_uniform_blocks.cpp
@@ -295,6 +295,7 @@ link_uniform_blocks(void *mem_ctx,
 
             blocks[i].Set = b->set;
             blocks[i].Binding = b->binding;
+            blocks[i].IsShaderStorage = b->is_shader_storage;
 
             i++;
          }
@@ -316,6 +317,7 @@ link_uniform_blocks(void *mem_ctx,
 
          blocks[i].Set = b->set;
          blocks[i].Binding = b->binding;
+         blocks[i].IsShaderStorage = b->is_shader_storage;
 
          i++;
       }
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index 5f57079d1b8..c482fbfdfb2 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -90,6 +90,7 @@ copy_constant_to_storage(union gl_constant_value *storage,
       case GLSL_TYPE_INTERFACE:
       case GLSL_TYPE_FUNCTION:
       case GLSL_TYPE_VOID:
+      case GLSL_TYPE_SUBROUTINE:
       case GLSL_TYPE_ERROR:
 	 /* All other types should have already been filtered by other
 	  * paths in the caller.
@@ -257,7 +258,8 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
       foreach_in_list(ir_instruction, node, shader->ir) {
 	 ir_variable *const var = node->as_variable();
 
-	 if (!var || var->data.mode != ir_var_uniform)
+	 if (!var || (var->data.mode != ir_var_uniform &&
+	     var->data.mode != ir_var_shader_storage))
 	    continue;
 
 	 if (!mem_ctx)
@@ -268,7 +270,7 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
 
             if (type->without_array()->is_sampler()) {
                linker::set_sampler_binding(prog, var->name, var->data.binding);
-            } else if (var->is_in_uniform_block()) {
+            } else if (var->is_in_buffer_block()) {
                const glsl_type *const iface_type = var->get_interface_type();
 
                /* If the variable is an array and it is an interface instance,
@@ -281,7 +283,7 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
                 *         float f[4];
                 *     };
                 *
-                * In this case "f" would pass is_in_uniform_block (above) and
+                * In this case "f" would pass is_in_buffer_block (above) and
                 * type->is_array(), but it will fail is_interface_instance().
                 */
                if (var->is_interface_instance() && var->type->is_array()) {
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 11ae06f9bfb..254086dc050 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -47,9 +47,10 @@
 static unsigned
 values_for_type(const glsl_type *type)
 {
-   if (type->is_sampler()) {
+   if (type->is_sampler() || type->is_subroutine()) {
       return 1;
-   } else if (type->is_array() && type->fields.array->is_sampler()) {
+   } else if (type->is_array() && (type->fields.array->is_sampler() ||
+                                   type->fields.array->is_subroutine())) {
       return type->array_size();
    } else {
       return type->component_slots();
@@ -284,6 +285,7 @@ public:
    count_uniform_size(struct string_to_uint_map *map)
       : num_active_uniforms(0), num_values(0), num_shader_samplers(0),
         num_shader_images(0), num_shader_uniform_components(0),
+        num_shader_subroutines(0),
         is_ubo_var(false), map(map)
    {
       /* empty */
@@ -294,11 +296,12 @@ public:
       this->num_shader_samplers = 0;
       this->num_shader_images = 0;
       this->num_shader_uniform_components = 0;
+      this->num_shader_subroutines = 0;
    }
 
    void process(ir_variable *var)
    {
-      this->is_ubo_var = var->is_in_uniform_block();
+      this->is_ubo_var = var->is_in_buffer_block();
       if (var->is_interface_instance())
          program_resource_visitor::process(var->get_interface_type(),
                                            var->get_interface_type()->name);
@@ -331,6 +334,11 @@ public:
     */
    unsigned num_shader_uniform_components;
 
+   /**
+    * Number of subroutine uniforms used
+    */
+   unsigned num_shader_subroutines;
+
    bool is_ubo_var;
 
 private:
@@ -348,7 +356,9 @@ private:
        * count it for each shader target.
        */
       const unsigned values = values_for_type(type);
-      if (type->contains_sampler()) {
+      if (type->contains_subroutine()) {
+         this->num_shader_subroutines += values;
+      } else if (type->contains_sampler()) {
          this->num_shader_samplers += values;
       } else if (type->contains_image()) {
          this->num_shader_images += values;
@@ -421,6 +431,7 @@ public:
       this->shader_shadow_samplers = 0;
       this->next_sampler = 0;
       this->next_image = 0;
+      this->next_subroutine = 0;
       memset(this->targets, 0, sizeof(this->targets));
    }
 
@@ -431,7 +442,7 @@ public:
       field_counter = 0;
 
       ubo_block_index = -1;
-      if (var->is_in_uniform_block()) {
+      if (var->is_in_buffer_block()) {
          if (var->is_interface_instance() && var->type->is_array()) {
             unsigned l = strlen(var->get_interface_type()->name);
 
@@ -535,6 +546,24 @@ private:
       }
    }
 
+   void handle_subroutines(const glsl_type *base_type,
+                           struct gl_uniform_storage *uniform)
+   {
+      if (base_type->is_subroutine()) {
+         uniform->subroutine[shader_type].index = this->next_subroutine;
+         uniform->subroutine[shader_type].active = true;
+
+         /* Increment the subroutine index by 1 for non-arrays and by the
+          * number of array elements for arrays.
+          */
+         this->next_subroutine += MAX2(1, uniform->array_elements);
+
+      } else {
+         uniform->subroutine[shader_type].index = ~0;
+         uniform->subroutine[shader_type].active = false;
+      }
+   }
+
    virtual void visit_field(const glsl_type *type, const char *name,
                             bool row_major)
    {
@@ -588,6 +617,7 @@ private:
       /* This assigns uniform indices to sampler and image uniforms. */
       handle_samplers(base_type, &this->uniforms[id]);
       handle_images(base_type, &this->uniforms[id]);
+      handle_subroutines(base_type, &this->uniforms[id]);
 
       /* If there is already storage associated with this uniform or if the
        * uniform is set as builtin, it means that it was set while processing
@@ -672,6 +702,7 @@ private:
    struct gl_uniform_storage *uniforms;
    unsigned next_sampler;
    unsigned next_image;
+   unsigned next_subroutine;
 
 public:
    union gl_constant_value *values;
@@ -763,10 +794,11 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
    foreach_in_list(ir_instruction, node, shader->ir) {
       ir_variable *const var = node->as_variable();
 
-      if ((var == NULL) || !var->is_in_uniform_block())
+      if ((var == NULL) || !var->is_in_buffer_block())
 	 continue;
 
-      assert(var->data.mode == ir_var_uniform);
+      assert(var->data.mode == ir_var_uniform ||
+             var->data.mode == ir_var_shader_storage);
 
       if (var->is_interface_instance()) {
          var->data.location = 0;
@@ -943,7 +975,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       foreach_in_list(ir_instruction, node, sh->ir) {
 	 ir_variable *const var = node->as_variable();
 
-	 if ((var == NULL) || (var->data.mode != ir_var_uniform))
+	 if ((var == NULL) || (var->data.mode != ir_var_uniform &&
+	                       var->data.mode != ir_var_shader_storage))
 	    continue;
 
 	 uniform_size.process(var);
@@ -952,8 +985,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       sh->num_samplers = uniform_size.num_shader_samplers;
       sh->NumImages = uniform_size.num_shader_images;
       sh->num_uniform_components = uniform_size.num_shader_uniform_components;
-
       sh->num_combined_uniform_components = sh->num_uniform_components;
+
       for (unsigned i = 0; i < sh->NumUniformBlocks; i++) {
 	 sh->num_combined_uniform_components +=
 	    sh->UniformBlocks[i].UniformBufferSize / 4;
@@ -987,7 +1020,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       foreach_in_list(ir_instruction, node, prog->_LinkedShaders[i]->ir) {
 	 ir_variable *const var = node->as_variable();
 
-	 if ((var == NULL) || (var->data.mode != ir_var_uniform))
+	 if ((var == NULL) || (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage))
 	    continue;
 
 	 parcel.set_and_process(prog, var);
@@ -1006,6 +1039,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 
    /* Reserve all the explicit locations of the active uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
+      if (uniforms[i].type->is_subroutine())
+         continue;
+
       if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC) {
          /* How many new entries for this uniform? */
          const unsigned entries = MAX2(1, uniforms[i].array_elements);
@@ -1023,6 +1059,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    /* Reserve locations for rest of the uniforms. */
    for (unsigned i = 0; i < num_uniforms; i++) {
 
+      if (uniforms[i].type->is_subroutine())
+         continue;
       /* Built-in uniforms should not get any location. */
       if (uniforms[i].builtin)
          continue;
@@ -1051,6 +1089,65 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       prog->NumUniformRemapTable += entries;
    }
 
+   /* Reserve all the explicit locations of the active subroutine uniforms. */
+   for (unsigned i = 0; i < num_uniforms; i++) {
+      if (!uniforms[i].type->is_subroutine())
+         continue;
+
+      if (uniforms[i].remap_location == UNMAPPED_UNIFORM_LOC)
+         continue;
+
+      for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
+         struct gl_shader *sh = prog->_LinkedShaders[j];
+         if (!sh)
+            continue;
+
+         if (!uniforms[i].subroutine[j].active)
+            continue;
+
+         /* How many new entries for this uniform? */
+         const unsigned entries = MAX2(1, uniforms[i].array_elements);
+
+         /* Set remap table entries point to correct gl_uniform_storage. */
+         for (unsigned k = 0; k < entries; k++) {
+            unsigned element_loc = uniforms[i].remap_location + k;
+            assert(sh->SubroutineUniformRemapTable[element_loc] ==
+                   INACTIVE_UNIFORM_EXPLICIT_LOCATION);
+            sh->SubroutineUniformRemapTable[element_loc] = &uniforms[i];
+         }
+      }
+   }
+
+   /* Reserve locations for subroutine uniforms without an explicit one. */
+   for (unsigned i = 0; i < num_uniforms; i++) {
+
+      if (!uniforms[i].type->is_subroutine())
+         continue;
+      const unsigned entries = MAX2(1, uniforms[i].array_elements);
+
+      if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC)
+         continue;
+      for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
+         struct gl_shader *sh = prog->_LinkedShaders[j];
+         if (!sh)
+            continue;
+
+         if (!uniforms[i].subroutine[j].active)
+            continue;
+
+         sh->SubroutineUniformRemapTable =
+            reralloc(sh,
+                     sh->SubroutineUniformRemapTable,
+                     gl_uniform_storage *,
+                     sh->NumSubroutineUniformRemapTable + entries);
+
+         for (unsigned k = 0; k < entries; k++)
+            sh->SubroutineUniformRemapTable[sh->NumSubroutineUniformRemapTable + k] = &uniforms[i];
+         uniforms[i].remap_location = sh->NumSubroutineUniformRemapTable;
+         sh->NumSubroutineUniformRemapTable += entries;
+      }
+   }
+
 #ifndef NDEBUG
    for (unsigned i = 0; i < num_uniforms; i++) {
       assert(uniforms[i].storage != NULL || uniforms[i].builtin);
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 278a778797b..f7a7b8c4c5b 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -54,10 +54,16 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
    /* Check that the types match between stages.
     */
    const glsl_type *type_to_match = input->type;
-   if (consumer_stage == MESA_SHADER_GEOMETRY) {
-      assert(type_to_match->is_array()); /* Enforced by ast_to_hir */
+
+   /* VS -> GS, VS -> TCS, VS -> TES, TES -> GS */
+   const bool extra_array_level = (producer_stage == MESA_SHADER_VERTEX &&
+                                   consumer_stage != MESA_SHADER_FRAGMENT) ||
+                                  consumer_stage == MESA_SHADER_GEOMETRY;
+   if (extra_array_level) {
+      assert(type_to_match->is_array());
       type_to_match = type_to_match->fields.array;
    }
+
    if (type_to_match != output->type) {
       /* There is a bit of a special case for gl_TexCoord.  This
        * built-in is unsized by default.  Applications that variable
@@ -116,6 +122,18 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
       return;
    }
 
+   if (input->data.patch != output->data.patch) {
+      linker_error(prog,
+                   "%s shader output `%s' %s patch qualifier, "
+                   "but %s shader input %s patch qualifier\n",
+                   _mesa_shader_stage_to_string(producer_stage),
+                   output->name,
+                   (output->data.patch) ? "has" : "lacks",
+                   _mesa_shader_stage_to_string(consumer_stage),
+                   (input->data.patch) ? "has" : "lacks");
+      return;
+   }
+
    if (!prog->IsES && input->data.invariant != output->data.invariant) {
       linker_error(prog,
                    "%s shader output `%s' %s invariant qualifier, "
@@ -128,7 +146,17 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
       return;
    }
 
-   if (input->data.interpolation != output->data.interpolation) {
+   /* GLSL >= 4.40 removes the text requiring interpolation qualifiers to
+    * match across stages; they now only have to match within the same stage.
+    *
+    * From page 84 (page 90 of the PDF) of the GLSL 4.40 spec:
+    *
+    *     "It is a link-time error if, within the same stage, the interpolation
+    *     qualifiers of variables of the same name do not match."
+    *
+    */
+   if (input->data.interpolation != output->data.interpolation &&
+       prog->Version < 440) {
       linker_error(prog,
                    "%s shader output `%s' specifies %s "
                    "interpolation qualifier, "
@@ -300,7 +328,7 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
 
    this->location = -1;
    this->orig_name = input;
-   this->is_clip_distance_mesa = false;
+   this->lowered_builtin_array_variable = none;
    this->skip_components = 0;
    this->next_buffer_separator = false;
    this->matched_candidate = NULL;
@@ -349,8 +377,15 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
     */
    if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].LowerClipDistance &&
        strcmp(this->var_name, "gl_ClipDistance") == 0) {
-      this->is_clip_distance_mesa = true;
+      this->lowered_builtin_array_variable = clip_distance;
    }
+
+   if (ctx->Const.LowerTessLevel &&
+       (strcmp(this->var_name, "gl_TessLevelOuter") == 0))
+      this->lowered_builtin_array_variable = tess_level_outer;
+   if (ctx->Const.LowerTessLevel &&
+       (strcmp(this->var_name, "gl_TessLevelInner") == 0))
+      this->lowered_builtin_array_variable = tess_level_inner;
 }
 
 
@@ -397,9 +432,22 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
          this->matched_candidate->type->fields.array->matrix_columns;
       const unsigned vector_elements =
          this->matched_candidate->type->fields.array->vector_elements;
-      unsigned actual_array_size = this->is_clip_distance_mesa ?
-         prog->LastClipDistanceArraySize :
-         this->matched_candidate->type->array_size();
+      unsigned actual_array_size;
+      switch (this->lowered_builtin_array_variable) {
+      case clip_distance:
+         actual_array_size = prog->LastClipDistanceArraySize;
+         break;
+      case tess_level_outer:
+         actual_array_size = 4;
+         break;
+      case tess_level_inner:
+         actual_array_size = 2;
+         break;
+      case none:
+      default:
+         actual_array_size = this->matched_candidate->type->array_size();
+         break;
+      }
 
       if (this->is_subscripted) {
          /* Check array bounds. */
@@ -410,7 +458,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
                          actual_array_size);
             return false;
          }
-         unsigned array_elem_size = this->is_clip_distance_mesa ?
+         unsigned array_elem_size = this->lowered_builtin_array_variable ?
             1 : vector_elements * matrix_cols;
          fine_location += array_elem_size * this->array_subscript;
          this->size = 1;
@@ -419,7 +467,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
       }
       this->vector_elements = vector_elements;
       this->matrix_columns = matrix_cols;
-      if (this->is_clip_distance_mesa)
+      if (this->lowered_builtin_array_variable)
          this->type = GL_FLOAT;
       else
          this->type = this->matched_candidate->type->fields.array->gl_type;
@@ -524,6 +572,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
       info->Outputs[info->NumOutputs].DstOffset = info->BufferStride[buffer];
       ++info->NumOutputs;
       info->BufferStride[buffer] += output_size;
+      info->BufferStream[buffer] = this->stream_id;
       num_components -= output_size;
       location++;
       location_frac = 0;
@@ -542,8 +591,21 @@ const tfeedback_candidate *
 tfeedback_decl::find_candidate(gl_shader_program *prog,
                                hash_table *tfeedback_candidates)
 {
-   const char *name = this->is_clip_distance_mesa
-      ? "gl_ClipDistanceMESA" : this->var_name;
+   const char *name = this->var_name;
+   switch (this->lowered_builtin_array_variable) {
+   case none:
+      name = this->var_name;
+      break;
+   case clip_distance:
+      name = "gl_ClipDistanceMESA";
+      break;
+   case tess_level_outer:
+      name = "gl_TessLevelOuterMESA";
+      break;
+   case tess_level_inner:
+      name = "gl_TessLevelInnerMESA";
+      break;
+   }
    this->matched_candidate = (const tfeedback_candidate *)
       hash_table_find(tfeedback_candidates, name);
    if (!this->matched_candidate) {
@@ -699,7 +761,9 @@ namespace {
 class varying_matches
 {
 public:
-   varying_matches(bool disable_varying_packing, bool consumer_is_fs);
+   varying_matches(bool disable_varying_packing,
+                   gl_shader_stage producer_stage,
+                   gl_shader_stage consumer_stage);
    ~varying_matches();
    void record(ir_variable *producer_var, ir_variable *consumer_var);
    unsigned assign_locations();
@@ -780,15 +844,18 @@ private:
     */
    unsigned matches_capacity;
 
-   const bool consumer_is_fs;
+   gl_shader_stage producer_stage;
+   gl_shader_stage consumer_stage;
 };
 
 } /* anonymous namespace */
 
 varying_matches::varying_matches(bool disable_varying_packing,
-                                 bool consumer_is_fs)
+                                 gl_shader_stage producer_stage,
+                                 gl_shader_stage consumer_stage)
    : disable_varying_packing(disable_varying_packing),
-     consumer_is_fs(consumer_is_fs)
+     producer_stage(producer_stage),
+     consumer_stage(consumer_stage)
 {
    /* Note: this initial capacity is rather arbitrarily chosen to be large
     * enough for many cases without wasting an unreasonable amount of space.
@@ -839,7 +906,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    }
 
    if ((consumer_var == NULL && producer_var->type->contains_integer()) ||
-       !consumer_is_fs) {
+       consumer_stage != MESA_SHADER_FRAGMENT) {
       /* Since this varying is not being consumed by the fragment shader, its
        * interpolation type varying cannot possibly affect rendering.  Also,
        * this variable is non-flat and is (or contains) an integer.
@@ -876,9 +943,22 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
    if (this->disable_varying_packing) {
-      unsigned slots = var->type->is_array()
-         ? (var->type->length * var->type->fields.array->matrix_columns)
-         : var->type->matrix_columns;
+      const struct glsl_type *type = var->type;
+      unsigned slots;
+
+      /* Some shader stages have 2-dimensional varyings. Use the inner type. */
+      if (!var->data.patch &&
+          ((var == producer_var && producer_stage == MESA_SHADER_TESS_CTRL) ||
+           (var == consumer_var && (consumer_stage == MESA_SHADER_TESS_CTRL ||
+                                    consumer_stage == MESA_SHADER_TESS_EVAL ||
+                                    consumer_stage == MESA_SHADER_GEOMETRY)))) {
+         assert(type->is_array());
+         type = type->fields.array;
+      }
+
+      slots = (type->is_array()
+            ? (type->length * type->fields.array->matrix_columns)
+            : type->matrix_columns);
       this->matches[this->num_matches].num_components = 4 * slots;
    } else {
       this->matches[this->num_matches].num_components
@@ -906,8 +986,17 @@ varying_matches::assign_locations()
          &varying_matches::match_comparator);
 
    unsigned generic_location = 0;
+   unsigned generic_patch_location = MAX_VARYING*4;
 
    for (unsigned i = 0; i < this->num_matches; i++) {
+      unsigned *location = &generic_location;
+
+      if ((this->matches[i].consumer_var &&
+           this->matches[i].consumer_var->data.patch) ||
+          (this->matches[i].producer_var &&
+           this->matches[i].producer_var->data.patch))
+         location = &generic_patch_location;
+
       /* Advance to the next slot if this varying has a different packing
        * class than the previous one, and we're not already on a slot
        * boundary.
@@ -915,12 +1004,12 @@ varying_matches::assign_locations()
       if (i > 0 &&
           this->matches[i - 1].packing_class
           != this->matches[i].packing_class) {
-         generic_location = ALIGN(generic_location, 4);
+         *location = ALIGN(*location, 4);
       }
 
-      this->matches[i].generic_location = generic_location;
+      this->matches[i].generic_location = *location;
 
-      generic_location += this->matches[i].num_components;
+      *location += this->matches[i].num_components;
    }
 
    return (generic_location + 3) / 4;
@@ -979,7 +1068,8 @@ varying_matches::compute_packing_class(const ir_variable *var)
     *
     * Therefore, the packing class depends only on the interpolation type.
     */
-   unsigned packing_class = var->data.centroid | (var->data.sample << 1);
+   unsigned packing_class = var->data.centroid | (var->data.sample << 1) |
+                            (var->data.patch << 2);
    packing_class *= 4;
    packing_class += var->data.interpolation;
    return packing_class;
@@ -1133,11 +1223,11 @@ bool
 populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
                              hash_table *consumer_inputs,
                              hash_table *consumer_interface_inputs,
-                             ir_variable *consumer_inputs_with_locations[VARYING_SLOT_MAX])
+                             ir_variable *consumer_inputs_with_locations[VARYING_SLOT_TESS_MAX])
 {
    memset(consumer_inputs_with_locations,
           0,
-          sizeof(consumer_inputs_with_locations[0]) * VARYING_SLOT_MAX);
+          sizeof(consumer_inputs_with_locations[0]) * VARYING_SLOT_TESS_MAX);
 
    foreach_in_list(ir_instruction, node, ir) {
       ir_variable *const input_var = node->as_variable();
@@ -1193,7 +1283,7 @@ get_matching_input(void *mem_ctx,
                    const ir_variable *output_var,
                    hash_table *consumer_inputs,
                    hash_table *consumer_interface_inputs,
-                   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_MAX])
+                   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_TESS_MAX])
 {
    ir_variable *input_var;
 
@@ -1294,9 +1384,6 @@ canonicalize_shader_io(exec_list *ir, enum ir_variable_mode io_mode)
  *        each of these objects that matches one of the outputs of the
  *        producer.
  *
- * \param gs_input_vertices: if \c consumer is a geometry shader, this is the
- *        number of input vertices it accepts.  Otherwise zero.
- *
  * When num_tfeedback_decls is nonzero, it is permissible for the consumer to
  * be NULL.  In this case, varying locations are assigned solely based on the
  * requirements of transform feedback.
@@ -1307,21 +1394,44 @@ assign_varying_locations(struct gl_context *ctx,
 			 struct gl_shader_program *prog,
 			 gl_shader *producer, gl_shader *consumer,
                          unsigned num_tfeedback_decls,
-                         tfeedback_decl *tfeedback_decls,
-                         unsigned gs_input_vertices)
+                         tfeedback_decl *tfeedback_decls)
 {
-   varying_matches matches(ctx->Const.DisableVaryingPacking,
-                           consumer && consumer->Stage == MESA_SHADER_FRAGMENT);
+   if (ctx->Const.DisableVaryingPacking) {
+      /* Transform feedback code assumes varyings are packed, so if the driver
+       * has disabled varying packing, make sure it does not support transform
+       * feedback.
+       */
+      assert(!ctx->Extensions.EXT_transform_feedback);
+   }
+
+   /* Tessellation shaders treat inputs and outputs as shared memory and can
+    * access inputs and outputs of other invocations.
+    * Therefore, they can't be lowered to temps easily (and definitely not
+    * efficiently).
+    */
+   bool disable_varying_packing =
+      ctx->Const.DisableVaryingPacking ||
+      (consumer && consumer->Stage == MESA_SHADER_TESS_EVAL) ||
+      (consumer && consumer->Stage == MESA_SHADER_TESS_CTRL) ||
+      (producer && producer->Stage == MESA_SHADER_TESS_CTRL);
+
+   varying_matches matches(disable_varying_packing,
+                           producer ? producer->Stage : (gl_shader_stage)-1,
+                           consumer ? consumer->Stage : (gl_shader_stage)-1);
    hash_table *tfeedback_candidates
       = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);
    hash_table *consumer_inputs
       = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);
    hash_table *consumer_interface_inputs
       = hash_table_ctor(0, hash_table_string_hash, hash_table_string_compare);
-   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_MAX] = {
+   ir_variable *consumer_inputs_with_locations[VARYING_SLOT_TESS_MAX] = {
       NULL,
    };
 
+   unsigned consumer_vertices = 0;
+   if (consumer && consumer->Stage == MESA_SHADER_GEOMETRY)
+      consumer_vertices = prog->Geom.VerticesIn;
+
    /* Operate in a total of four passes.
     *
     * 1. Sort inputs / outputs into a canonical order.  This is necessary so
@@ -1380,8 +1490,12 @@ assign_varying_locations(struct gl_context *ctx,
          /* If a matching input variable was found, add this ouptut (and the
           * input) to the set.  If this is a separable program and there is no
           * consumer stage, add the output.
+          *
+          * Always add TCS outputs. They are shared by all invocations
+          * within a patch and can be used as shared memory.
           */
-         if (input_var || (prog->SeparateShader && consumer == NULL)) {
+         if (input_var || (prog->SeparateShader && consumer == NULL) ||
+             producer->Type == GL_TESS_CONTROL_SHADER) {
             matches.record(output_var, input_var);
          }
 
@@ -1448,20 +1562,14 @@ assign_varying_locations(struct gl_context *ctx,
    hash_table_dtor(consumer_inputs);
    hash_table_dtor(consumer_interface_inputs);
 
-   if (ctx->Const.DisableVaryingPacking) {
-      /* Transform feedback code assumes varyings are packed, so if the driver
-       * has disabled varying packing, make sure it does not support transform
-       * feedback.
-       */
-      assert(!ctx->Extensions.EXT_transform_feedback);
-   } else {
+   if (!disable_varying_packing) {
       if (producer) {
          lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
                                0, producer);
       }
       if (consumer) {
          lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               gs_input_vertices, consumer);
+                               consumer_vertices, consumer);
       }
    }
 
diff --git a/src/glsl/link_varyings.h b/src/glsl/link_varyings.h
index afc16a8baa7..2ce72d43d84 100644
--- a/src/glsl/link_varyings.h
+++ b/src/glsl/link_varyings.h
@@ -128,7 +128,7 @@ public:
     */
    unsigned num_components() const
    {
-      if (this->is_clip_distance_mesa)
+      if (this->lowered_builtin_array_variable)
          return this->size;
       else
          return this->vector_elements * this->matrix_columns * this->size;
@@ -161,10 +161,15 @@ private:
    unsigned array_subscript;
 
    /**
-    * True if the variable is gl_ClipDistance and the driver lowers
-    * gl_ClipDistance to gl_ClipDistanceMESA.
+    * Non-zero if the variable is gl_ClipDistance, gl_TessLevelOuter or
+    * gl_TessLevelInner and the driver lowers it to gl_*MESA.
     */
-   bool is_clip_distance_mesa;
+   enum {
+      none,
+      clip_distance,
+      tess_level_outer,
+      tess_level_inner,
+   } lowered_builtin_array_variable;
 
    /**
     * The vertex shader output location that the linker assigned for this
@@ -250,8 +255,7 @@ assign_varying_locations(struct gl_context *ctx,
 			 struct gl_shader_program *prog,
 			 gl_shader *producer, gl_shader *consumer,
                          unsigned num_tfeedback_decls,
-                         tfeedback_decl *tfeedback_decls,
-                         unsigned gs_input_vertices);
+                         tfeedback_decl *tfeedback_decls);
 
 bool
 check_against_output_limit(struct gl_context *ctx,
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 4a726d4e2e7..a7cd82049bd 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -250,6 +250,144 @@ public:
    }
 };
 
+class tess_eval_array_resize_visitor : public ir_hierarchical_visitor {
+public:
+   unsigned num_vertices;
+   gl_shader_program *prog;
+
+   tess_eval_array_resize_visitor(unsigned num_vertices, gl_shader_program *prog)
+   {
+      this->num_vertices = num_vertices;
+      this->prog = prog;
+   }
+
+   virtual ~tess_eval_array_resize_visitor()
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit(ir_variable *var)
+   {
+      if (!var->type->is_array() || var->data.mode != ir_var_shader_in || var->data.patch)
+         return visit_continue;
+
+      var->type = glsl_type::get_array_instance(var->type->fields.array,
+                                                this->num_vertices);
+      var->data.max_array_access = this->num_vertices - 1;
+
+      return visit_continue;
+   }
+
+   /* Dereferences of input variables need to be updated so that their type
+    * matches the newly assigned type of the variable they are accessing. */
+   virtual ir_visitor_status visit(ir_dereference_variable *ir)
+   {
+      ir->type = ir->var->type;
+      return visit_continue;
+   }
+
+   /* Dereferences of 2D input arrays need to be updated so that their type
+    * matches the newly assigned type of the array they are accessing. */
+   virtual ir_visitor_status visit_leave(ir_dereference_array *ir)
+   {
+      const glsl_type *const vt = ir->array->type;
+      if (vt->is_array())
+         ir->type = vt->fields.array;
+      return visit_continue;
+   }
+};
+
+class barrier_use_visitor : public ir_hierarchical_visitor {
+public:
+   barrier_use_visitor(gl_shader_program *prog)
+      : prog(prog), in_main(false), after_return(false), control_flow(0)
+   {
+   }
+
+   virtual ~barrier_use_visitor()
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit_enter(ir_function *ir)
+   {
+      if (strcmp(ir->name, "main") == 0)
+         in_main = true;
+
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_function *ir)
+   {
+      in_main = false;
+      after_return = false;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_return *ir)
+   {
+      after_return = true;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_enter(ir_if *ir)
+   {
+      ++control_flow;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_if *ir)
+   {
+      --control_flow;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_enter(ir_loop *ir)
+   {
+      ++control_flow;
+      return visit_continue;
+   }
+
+   virtual ir_visitor_status visit_leave(ir_loop *ir)
+   {
+      --control_flow;
+      return visit_continue;
+   }
+
+   /* FINISHME: `switch` is not expressed at the IR level -- it's already
+    * been lowered to a mess of `if`s. We'll correctly disallow any use of
+    * barrier() in a conditional path within the switch, but not in a path
+    * which is always hit.
+    */
+
+   virtual ir_visitor_status visit_enter(ir_call *ir)
+   {
+      if (ir->use_builtin && strcmp(ir->callee_name(), "barrier") == 0) {
+         /* Use of barrier(); determine if it is legal: */
+         if (!in_main) {
+            linker_error(prog, "Builtin barrier() may only be used in main");
+            return visit_stop;
+         }
+
+         if (after_return) {
+            linker_error(prog, "Builtin barrier() may not be used after return");
+            return visit_stop;
+         }
+
+         if (control_flow != 0) {
+            linker_error(prog, "Builtin barrier() may not be used inside control flow");
+            return visit_stop;
+         }
+      }
+      return visit_continue;
+   }
+
+private:
+   gl_shader_program *prog;
+   bool in_main, after_return;
+   int control_flow;
+};
+
 /**
  * Visitor that determines the highest stream id to which a (geometry) shader
  * emits vertices. It also checks whether End{Stream}Primitive is ever called.
@@ -346,6 +484,39 @@ private:
    bool uses_non_zero_stream;
 };
 
+/* Class that finds sampler array derefs and checks if indexes are dynamic. */
+class dynamic_sampler_array_indexing_visitor : public ir_hierarchical_visitor
+{
+public:
+   dynamic_sampler_array_indexing_visitor() :
+      dynamic_sampler_array_indexing(false)
+   {
+   }
+
+   ir_visitor_status visit_enter(ir_dereference_array *ir)
+   {
+      if (!ir->variable_referenced())
+         return visit_continue;
+
+      if (!ir->variable_referenced()->type->contains_sampler())
+         return visit_continue;
+
+      if (!ir->array_index->constant_expression_value()) {
+         dynamic_sampler_array_indexing = true;
+         return visit_stop;
+      }
+      return visit_continue;
+   }
+
+   bool uses_dynamic_sampler_array_indexing()
+   {
+      return dynamic_sampler_array_indexing;
+   }
+
+private:
+   bool dynamic_sampler_array_indexing;
+};
+
 } /* anonymous namespace */
 
 void
@@ -429,6 +600,10 @@ parse_program_resource_name(const GLchar *name,
    if (array_index < 0)
       return -1;
 
+   /* Check for leading zero */
+   if (name[i] == '0' && name[i+1] != ']')
+      return -1;
+
    *out_base_name_end = name + (i - 1);
    return array_index;
 }
@@ -582,6 +757,17 @@ validate_vertex_shader_executable(struct gl_shader_program *prog,
                       &prog->Vert.ClipDistanceArraySize);
 }
 
+void
+validate_tess_eval_shader_executable(struct gl_shader_program *prog,
+                                     struct gl_shader *shader)
+{
+   if (shader == NULL)
+      return;
+
+   analyze_clip_usage(prog, shader, &prog->TessEval.UsesClipDistance,
+                      &prog->TessEval.ClipDistanceArraySize);
+}
+
 
 /**
  * Verify that a fragment shader executable meets all semantic requirements
@@ -744,9 +930,13 @@ cross_validate_globals(struct gl_shader_program *prog,
 	 if (var == NULL)
 	    continue;
 
-	 if (uniforms_only && (var->data.mode != ir_var_uniform))
+	 if (uniforms_only && (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage))
 	    continue;
 
+         /* don't cross validate subroutine uniforms */
+         if (var->type->contains_subroutine())
+            continue;
+
 	 /* Don't cross validate temporaries that are at global scope.  These
 	  * will eventually get pulled into the shaders 'main'.
 	  */
@@ -1217,8 +1407,7 @@ public:
                resize_interface_members(var->type->fields.array,
                                         var->get_max_ifc_array_access());
             var->change_interface_type(new_type);
-            var->type =
-               glsl_type::get_array_instance(new_type, var->type->length);
+            var->type = update_interface_members_array(var->type, new_type);
          }
       } else if (const glsl_type *ifc_type = var->get_interface_type()) {
          /* Store a pointer to the variable in the unnamed_interfaces
@@ -1266,6 +1455,21 @@ private:
       }
    }
 
+   static const glsl_type *
+   update_interface_members_array(const glsl_type *type,
+                                  const glsl_type *new_interface_type)
+   {
+      const glsl_type *element_type = type->fields.array;
+      if (element_type->is_array()) {
+         const glsl_type *new_array_type =
+            update_interface_members_array(element_type, new_interface_type);
+         return glsl_type::get_array_instance(new_array_type, type->length);
+      } else {
+         return glsl_type::get_array_instance(new_interface_type,
+                                              type->length);
+      }
+   }
+
    /**
     * Determine whether the given interface type contains unsized arrays (if
     * it doesn't, array_sizing_visitor doesn't need to process it).
@@ -1350,6 +1554,167 @@ private:
    hash_table *unnamed_interfaces;
 };
 
+
+/**
+ * Performs the cross-validation of tessellation control shader vertices and
+ * layout qualifiers for the attached tessellation control shaders,
+ * and propagates them to the linked TCS and linked shader program.
+ */
+static void
+link_tcs_out_layout_qualifiers(struct gl_shader_program *prog,
+			      struct gl_shader *linked_shader,
+			      struct gl_shader **shader_list,
+			      unsigned num_shaders)
+{
+   linked_shader->TessCtrl.VerticesOut = 0;
+
+   if (linked_shader->Stage != MESA_SHADER_TESS_CTRL)
+      return;
+
+   /* From the GLSL 4.0 spec (chapter 4.3.8.2):
+    *
+    *     "All tessellation control shader layout declarations in a program
+    *      must specify the same output patch vertex count.  There must be at
+    *      least one layout qualifier specifying an output patch vertex count
+    *      in any program containing tessellation control shaders; however,
+    *      such a declaration is not required in all tessellation control
+    *      shaders."
+    */
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+      struct gl_shader *shader = shader_list[i];
+
+      if (shader->TessCtrl.VerticesOut != 0) {
+	 if (linked_shader->TessCtrl.VerticesOut != 0 &&
+	     linked_shader->TessCtrl.VerticesOut != shader->TessCtrl.VerticesOut) {
+	    linker_error(prog, "tessellation control shader defined with "
+			 "conflicting output vertex count (%d and %d)\n",
+			 linked_shader->TessCtrl.VerticesOut,
+			 shader->TessCtrl.VerticesOut);
+	    return;
+	 }
+	 linked_shader->TessCtrl.VerticesOut = shader->TessCtrl.VerticesOut;
+      }
+   }
+
+   /* Just do the intrastage -> interstage propagation right now,
+    * since we already know we're in the right type of shader program
+    * for doing it.
+    */
+   if (linked_shader->TessCtrl.VerticesOut == 0) {
+      linker_error(prog, "tessellation control shader didn't declare "
+		   "vertices out layout qualifier\n");
+      return;
+   }
+   prog->TessCtrl.VerticesOut = linked_shader->TessCtrl.VerticesOut;
+}
+
+
+/**
+ * Performs the cross-validation of tessellation evaluation shader
+ * primitive type, vertex spacing, ordering and point_mode layout qualifiers
+ * for the attached tessellation evaluation shaders, and propagates them
+ * to the linked TES and linked shader program.
+ */
+static void
+link_tes_in_layout_qualifiers(struct gl_shader_program *prog,
+				struct gl_shader *linked_shader,
+				struct gl_shader **shader_list,
+				unsigned num_shaders)
+{
+   linked_shader->TessEval.PrimitiveMode = PRIM_UNKNOWN;
+   linked_shader->TessEval.Spacing = 0;
+   linked_shader->TessEval.VertexOrder = 0;
+   linked_shader->TessEval.PointMode = -1;
+
+   if (linked_shader->Stage != MESA_SHADER_TESS_EVAL)
+      return;
+
+   /* From the GLSL 4.0 spec (chapter 4.3.8.1):
+    *
+    *     "At least one tessellation evaluation shader (compilation unit) in
+    *      a program must declare a primitive mode in its input layout.
+    *      Declaring vertex spacing, ordering, and point mode identifiers is
+    *      optional.  It is not required that all tessellation evaluation
+    *      shaders in a program declare a primitive mode.  If spacing or
+    *      vertex ordering declarations are omitted, the tessellation
+    *      primitive generator will use equal spacing or counter-clockwise
+    *      vertex ordering, respectively.  If a point mode declaration is
+    *      omitted, the tessellation primitive generator will produce lines or
+    *      triangles according to the primitive mode."
+    */
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+      struct gl_shader *shader = shader_list[i];
+
+      if (shader->TessEval.PrimitiveMode != PRIM_UNKNOWN) {
+	 if (linked_shader->TessEval.PrimitiveMode != PRIM_UNKNOWN &&
+	     linked_shader->TessEval.PrimitiveMode != shader->TessEval.PrimitiveMode) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting input primitive modes.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.PrimitiveMode = shader->TessEval.PrimitiveMode;
+      }
+
+      if (shader->TessEval.Spacing != 0) {
+	 if (linked_shader->TessEval.Spacing != 0 &&
+	     linked_shader->TessEval.Spacing != shader->TessEval.Spacing) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting vertex spacing.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.Spacing = shader->TessEval.Spacing;
+      }
+
+      if (shader->TessEval.VertexOrder != 0) {
+	 if (linked_shader->TessEval.VertexOrder != 0 &&
+	     linked_shader->TessEval.VertexOrder != shader->TessEval.VertexOrder) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting ordering.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.VertexOrder = shader->TessEval.VertexOrder;
+      }
+
+      if (shader->TessEval.PointMode != -1) {
+	 if (linked_shader->TessEval.PointMode != -1 &&
+	     linked_shader->TessEval.PointMode != shader->TessEval.PointMode) {
+	    linker_error(prog, "tessellation evaluation shader defined with "
+			 "conflicting point modes.\n");
+	    return;
+	 }
+	 linked_shader->TessEval.PointMode = shader->TessEval.PointMode;
+      }
+
+   }
+
+   /* Just do the intrastage -> interstage propagation right now,
+    * since we already know we're in the right type of shader program
+    * for doing it.
+    */
+   if (linked_shader->TessEval.PrimitiveMode == PRIM_UNKNOWN) {
+      linker_error(prog,
+		   "tessellation evaluation shader didn't declare input "
+		   "primitive modes.\n");
+      return;
+   }
+   prog->TessEval.PrimitiveMode = linked_shader->TessEval.PrimitiveMode;
+
+   if (linked_shader->TessEval.Spacing == 0)
+      linked_shader->TessEval.Spacing = GL_EQUAL;
+   prog->TessEval.Spacing = linked_shader->TessEval.Spacing;
+
+   if (linked_shader->TessEval.VertexOrder == 0)
+      linked_shader->TessEval.VertexOrder = GL_CCW;
+   prog->TessEval.VertexOrder = linked_shader->TessEval.VertexOrder;
+
+   if (linked_shader->TessEval.PointMode == -1)
+      linked_shader->TessEval.PointMode = GL_FALSE;
+   prog->TessEval.PointMode = linked_shader->TessEval.PointMode;
+}
+
+
 /**
  * Performs the cross-validation of layout qualifiers specified in
  * redeclaration of gl_FragCoord for the attached fragment shaders,
@@ -1696,6 +2061,8 @@ link_intrastage_shaders(void *mem_ctx,
    ralloc_steal(linked, linked->UniformBlocks);
 
    link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
+   link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
+   link_tes_in_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_gs_inout_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_cs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
 
@@ -1778,6 +2145,14 @@ link_intrastage_shaders(void *mem_ctx,
    if (ctx->Const.VertexID_is_zero_based)
       lower_vertex_id(linked);
 
+   /* Validate correct usage of barrier() in the tess control shader */
+   if (linked->Stage == MESA_SHADER_TESS_CTRL) {
+      barrier_use_visitor visitor(prog);
+      foreach_in_list(ir_instruction, ir, linked->ir) {
+         ir->accept(&visitor);
+      }
+   }
+
    /* Make a pass over all variable declarations to ensure that arrays with
     * unspecified sizes have a size specified.  The size is inferred from the
     * max_array_access field.
@@ -1825,8 +2200,11 @@ update_array_sizes(struct gl_shader_program *prog)
           * Atomic counters are supposed to get deterministic
           * locations assigned based on the declaration ordering and
           * sizes, array compaction would mess that up.
+          *
+          * Subroutine uniforms are not removed.
 	  */
-	 if (var->is_in_uniform_block() || var->type->contains_atomic())
+	 if (var->is_in_buffer_block() || var->type->contains_atomic() ||
+	     var->type->contains_subroutine())
 	    continue;
 
 	 unsigned int size = var->data.max_array_access;
@@ -1871,6 +2249,34 @@ update_array_sizes(struct gl_shader_program *prog)
    }
 }
 
+/**
+ * Make the per-vertex input arrays of the linked tessellation evaluation
+ * shader match the per-vertex output count of the tessellation control stage.
+ */
+static void
+resize_tes_inputs(struct gl_context *ctx,
+                  struct gl_shader_program *prog)
+{
+   gl_shader *const tes = prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
+   gl_shader *const tcs = prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
+
+   /* Nothing to resize when the program has no evaluation stage. */
+   if (tes == NULL)
+      return;
+
+   /* Without a control shader the TES inputs are statically sized to
+    * MaxPatchVertices; the real array size is only known at draw time.
+    */
+   int num_vertices = ctx->Const.MaxPatchVertices;
+   if (tcs != NULL)
+      num_vertices = tcs->TessCtrl.VerticesOut;
+
+   tess_eval_array_resize_visitor resizer(num_vertices, prog);
+   foreach_in_list(ir_instruction, node, tes->ir) {
+      node->accept(&resizer);
+   }
+}
+
 /**
  * Find a contiguous set of available bits in a bitmask.
  *
@@ -1907,12 +2313,10 @@ find_available_slots(unsigned used_mask, unsigned needed_count)
  * Assign locations for either VS inputs or FS outputs
  *
  * \param prog          Shader program whose variables need locations assigned
+ * \param constants     Driver specific constant values for the program.
  * \param target_index  Selector for the program target to receive location
  *                      assignmnets.  Must be either \c MESA_SHADER_VERTEX or
  *                      \c MESA_SHADER_FRAGMENT.
- * \param max_index     Maximum number of generic locations.  This corresponds
- *                      to either the maximum number of draw buffers or the
- *                      maximum number of generic attributes.
  *
  * \return
  * If locations are successfully assigned, true is returned.  Otherwise an
@@ -1920,9 +2324,17 @@ find_available_slots(unsigned used_mask, unsigned needed_count)
  */
 bool
 assign_attribute_or_color_locations(gl_shader_program *prog,
-				    unsigned target_index,
-				    unsigned max_index)
+                                    struct gl_constants *constants,
+                                    unsigned target_index)
 {
+   /* Maximum number of generic locations.  This corresponds to either the
+    * maximum number of draw buffers or the maximum number of generic
+    * attributes.
+    */
+   unsigned max_index = (target_index == MESA_SHADER_VERTEX) ?
+      constants->Program[target_index].MaxAttribs :
+      MAX2(constants->MaxDrawBuffers, constants->MaxDualSourceDrawBuffers);
+
    /* Mark invalid locations as being used.
     */
    unsigned used_locations = (max_index >= 32)
@@ -2019,6 +2431,25 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
 	 }
       }
 
+      /* From GL4.5 core spec, section 15.2 (Shader Execution):
+       *
+       *     "Output binding assignments will cause LinkProgram to fail:
+       *     ...
+       *     If the program has an active output assigned to a location greater
+       *     than or equal to the value of MAX_DUAL_SOURCE_DRAW_BUFFERS and has
+       *     an active output assigned an index greater than or equal to one;"
+       */
+      if (target_index == MESA_SHADER_FRAGMENT && var->data.index >= 1 &&
+          var->data.location - generic_base >=
+          (int) constants->MaxDualSourceDrawBuffers) {
+         linker_error(prog,
+                      "output location %d >= GL_MAX_DUAL_SOURCE_DRAW_BUFFERS "
+                      "with index %u for %s\n",
+                      var->data.location - generic_base, var->data.index,
+                      var->name);
+         return false;
+      }
+
       const unsigned slots = var->type->count_attribute_slots();
 
       /* From GL4.5 core spec, section 11.1.1 (Vertex Attributes):
@@ -2389,6 +2820,49 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
    }
 }
 
+static void
+link_calculate_subroutine_compat(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   /* For each subroutine uniform, count the functions that list its type. */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader *sh = prog->_LinkedShaders[i];
+      if (!sh)
+         continue;
+
+      for (unsigned j = 0; j < sh->NumSubroutineUniformRemapTable; j++) {
+         struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[j];
+         /* Skip empty slots and reserved-but-inactive sentinel entries. */
+         if (!uni || uni == INACTIVE_UNIFORM_EXPLICIT_LOCATION)
+            continue;
+
+         int count = 0;
+         for (unsigned f = 0; f < sh->NumSubroutineFunctions; f++) {
+            struct gl_subroutine_function *fn = &sh->SubroutineFunctions[f];
+            for (int k = 0; k < fn->num_compat_types; k++) {
+               if (fn->types[k] == uni->type) {
+                  count++;
+                  break;
+               }
+            }
+         }
+         uni->num_compatible_subroutines = count;
+      }
+   }
+}
+
+static void
+check_subroutine_resources(struct gl_context *ctx, struct gl_shader_program *prog)
+{
+   /* Report every stage whose subroutine uniform remap table is too large. */
+   for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+      const struct gl_shader *linked = prog->_LinkedShaders[stage];
+      if (!linked ||
+          linked->NumSubroutineUniformRemapTable <= MAX_SUBROUTINE_UNIFORM_LOCATIONS)
+         continue;
+      linker_error(prog, "Too many %s shader subroutine uniforms\n",
+                   _mesa_shader_stage_to_string(stage));
+   }
+}
 /**
  * Validate shader image resources.
  */
@@ -2406,8 +2880,9 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 
       if (sh) {
          if (sh->NumImages > ctx->Const.Program[i].MaxImageUniforms)
-            linker_error(prog, "Too many %s shader image uniforms\n",
-                         _mesa_shader_stage_to_string(i));
+            linker_error(prog, "Too many %s shader image uniforms (%u > %u)\n",
+                         _mesa_shader_stage_to_string(i), sh->NumImages,
+                         ctx->Const.Program[i].MaxImageUniforms);
 
          total_image_units += sh->NumImages;
 
@@ -2497,6 +2972,59 @@ reserve_explicit_locations(struct gl_shader_program *prog,
    return true;
 }
 
+static bool
+reserve_subroutine_explicit_locations(struct gl_shader_program *prog,
+                                      struct gl_shader *sh,
+                                      ir_variable *var)
+{
+   /* "needed" is one past the last location this variable occupies. */
+   const unsigned num_slots = var->type->uniform_locations();
+   const unsigned needed = var->data.location + num_slots;
+
+   /* Grow the stage's remap table when it cannot hold the last slot. */
+   if (needed > sh->NumSubroutineUniformRemapTable) {
+      sh->SubroutineUniformRemapTable =
+         reralloc(sh, sh->SubroutineUniformRemapTable,
+                  gl_uniform_storage *, needed);
+
+      if (sh->SubroutineUniformRemapTable == NULL) {
+         linker_error(prog, "Out of memory during linking.\n");
+         return false;
+      }
+
+      /* Entries added by the resize start out empty. */
+      unsigned fresh = sh->NumSubroutineUniformRemapTable;
+      while (fresh < needed)
+         sh->SubroutineUniformRemapTable[fresh++] = NULL;
+
+      sh->NumSubroutineUniformRemapTable = needed;
+   }
+
+   for (unsigned slot = 0; slot < num_slots; slot++) {
+      gl_uniform_storage **entry =
+         &sh->SubroutineUniformRemapTable[var->data.location + slot];
+
+      /* ARB_explicit_uniform_location specification states:
+       *     "No two subroutine uniform variables can have the same location
+       *     in the same shader stage, otherwise a compiler or linker error
+       *     will be generated."
+       *
+       * A slot that already holds the reservation marker is a duplicate.
+       */
+      if (*entry == INACTIVE_UNIFORM_EXPLICIT_LOCATION) {
+         linker_error(prog,
+                      "location qualifier for uniform %s overlaps "
+                      "previously used location\n",
+                      var->name);
+         return false;
+      }
+
+      /* Mark inactive ahead of optimization rounds and location assignment. */
+      *entry = INACTIVE_UNIFORM_EXPLICIT_LOCATION;
+   }
+
+   return true;
+}
 /**
  * Check and reserve all explicit uniform locations, called before
  * any optimizations happen to handle also inactive uniforms and
@@ -2527,9 +3055,14 @@ check_explicit_uniform_locations(struct gl_context *ctx,
 
       foreach_in_list(ir_instruction, node, sh->ir) {
          ir_variable *var = node->as_variable();
-         if ((var && var->data.mode == ir_var_uniform) &&
+         if (var && (var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage) &&
              var->data.explicit_location) {
-            if (!reserve_explicit_locations(prog, uniform_map, var)) {
+            bool ret;
+            if (var->type->is_subroutine())
+               ret = reserve_subroutine_explicit_locations(prog, sh, var);
+            else
+               ret = reserve_explicit_locations(prog, uniform_map, var);
+            if (!ret) {
                delete uniform_map;
                return;
             }
@@ -2578,7 +3111,8 @@ add_program_resource(struct gl_shader_program *prog, GLenum type,
  * Function builds a stage reference bitmask from variable name.
  */
 static uint8_t
-build_stageref(struct gl_shader_program *shProg, const char *name)
+build_stageref(struct gl_shader_program *shProg, const char *name,
+               unsigned mode)
 {
    uint8_t stages = 0;
 
@@ -2591,9 +3125,34 @@ build_stageref(struct gl_shader_program *shProg, const char *name)
       struct gl_shader *sh = shProg->_LinkedShaders[i];
       if (!sh)
          continue;
-      ir_variable *var = sh->symbols->get_variable(name);
-      if (var)
-         stages |= (1 << i);
+
+      /* Shader symbol table may contain variables that have
+       * been optimized away. Search IR for the variable instead.
+       */
+      foreach_in_list(ir_instruction, node, sh->ir) {
+         ir_variable *var = node->as_variable();
+         if (var) {
+            unsigned baselen = strlen(var->name);
+
+            /* Type needs to match if specified, otherwise we might
+             * pick a variable with same name but different interface.
+             */
+            if (var->data.mode != mode)
+               continue;
+
+            if (strncmp(var->name, name, baselen) == 0) {
+               /* Check for exact name matches but also check for arrays and
+                * structs.
+                */
+               if (name[baselen] == '\0' ||
+                   name[baselen] == '[' ||
+                   name[baselen] == '.') {
+                  stages |= (1 << i);
+                  break;
+               }
+            }
+         }
+      }
    }
    return stages;
 }
@@ -2638,7 +3197,8 @@ add_interface_variables(struct gl_shader_program *shProg,
       };
 
       if (!add_program_resource(shProg, programInterface, var,
-                                build_stageref(shProg, var->name) | mask))
+                                build_stageref(shProg, var->name,
+                                               var->data.mode) | mask))
          return false;
    }
    return true;
@@ -2648,7 +3208,7 @@ add_interface_variables(struct gl_shader_program *shProg,
  * Builds up a list of program resources that point to existing
  * resource data.
  */
-static void
+void
 build_program_resource_list(struct gl_context *ctx,
                             struct gl_shader_program *shProg)
 {
@@ -2689,12 +3249,9 @@ build_program_resource_list(struct gl_context *ctx,
    /* Add transform feedback varyings. */
    if (shProg->LinkedTransformFeedback.NumVarying > 0) {
       for (int i = 0; i < shProg->LinkedTransformFeedback.NumVarying; i++) {
-         uint8_t stageref =
-            build_stageref(shProg,
-                           shProg->LinkedTransformFeedback.Varyings[i].Name);
          if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_VARYING,
                                    &shProg->LinkedTransformFeedback.Varyings[i],
-                                   stageref))
+                                   0))
          return;
       }
    }
@@ -2706,7 +3263,8 @@ build_program_resource_list(struct gl_context *ctx,
          continue;
 
       uint8_t stageref =
-         build_stageref(shProg, shProg->UniformStorage[i].name);
+         build_stageref(shProg, shProg->UniformStorage[i].name,
+                        ir_var_uniform);
 
       /* Add stagereferences for uniforms in a uniform block. */
       int block_index = shProg->UniformStorage[i].block_index;
@@ -2736,13 +3294,111 @@ build_program_resource_list(struct gl_context *ctx,
          return;
    }
 
+   for (unsigned i = 0; i < shProg->NumUniformStorage; i++) {
+      GLenum type;
+      if (!shProg->UniformStorage[i].hidden)
+         continue;
+
+      for (int j = MESA_SHADER_VERTEX; j < MESA_SHADER_STAGES; j++) {
+         if (!shProg->UniformStorage[i].subroutine[j].active)
+            continue;
+
+         type = _mesa_shader_stage_to_subroutine_uniform((gl_shader_stage)j);
+         /* add shader subroutines */
+         if (!add_program_resource(shProg, type, &shProg->UniformStorage[i], 0))
+            return;
+      }
+   }
+
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader *sh = shProg->_LinkedShaders[i];
+      GLuint type;
+
+      if (!sh)
+         continue;
+
+      type = _mesa_shader_stage_to_subroutine((gl_shader_stage)i);
+      for (unsigned j = 0; j < sh->NumSubroutineFunctions; j++) {
+         if (!add_program_resource(shProg, type, &sh->SubroutineFunctions[j], 0))
+            return;
+      }
+   }
+
    /* TODO - following extensions will require more resource types:
     *
     *    GL_ARB_shader_storage_buffer_object
-    *    GL_ARB_shader_subroutine
     */
 }
 
+/**
+ * This check is done to make sure we allow only constant expression
+ * indexing and "constant-index-expression" (indexing with an expression
+ * that includes loop induction variable).  Returns false after a linker
+ * error when a stage with EmitNoIndirectSampler set is affected.
+ */
+static bool
+validate_sampler_array_indexing(struct gl_context *ctx,
+                                struct gl_shader_program *prog)
+{
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      if (prog->_LinkedShaders[i] == NULL)
+         continue;
+      bool no_dynamic_indexing =
+         ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler;
+
+      /* Search for array derefs with a visitor that is fresh for this stage. */
+      dynamic_sampler_array_indexing_visitor v;
+      v.run(prog->_LinkedShaders[i]->ir);
+      if (v.uses_dynamic_sampler_array_indexing()) {
+         const char *msg = "sampler arrays indexed with non-constant "
+                           "expressions is forbidden in GLSL %s %u\n";
+         /* Backend has indicated that it has no dynamic indexing support. */
+         if (no_dynamic_indexing) {
+            linker_error(prog, msg, prog->IsES ? "ES" : "", prog->Version);
+            return false;
+         } else {
+            linker_warning(prog, msg, prog->IsES ? "ES" : "", prog->Version);
+         }
+      }
+   }
+   return true;
+}
+
+void
+link_assign_subroutine_types(struct gl_context *ctx,
+                             struct gl_shader_program *prog)
+{
+   /* Record, per linked stage, each function that has subroutine types. */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      gl_shader *sh = prog->_LinkedShaders[i];
+      if (sh == NULL)
+         continue;
+
+      foreach_in_list(ir_instruction, node, sh->ir) {
+         ir_function *fn = node->as_function();
+         if (!fn)
+            continue;
+
+         if (fn->is_subroutine)
+            sh->NumSubroutineUniformTypes++;
+         if (!fn->num_subroutine_types)
+            continue;
+
+         sh->SubroutineFunctions = reralloc(sh, sh->SubroutineFunctions,
+                                            struct gl_subroutine_function,
+                                            sh->NumSubroutineFunctions + 1);
+         struct gl_subroutine_function *entry =
+            &sh->SubroutineFunctions[sh->NumSubroutineFunctions];
+         entry->name = ralloc_strdup(sh, fn->name);
+         entry->num_compat_types = fn->num_subroutine_types;
+         entry->types = ralloc_array(sh, const struct glsl_type *,
+                                     fn->num_subroutine_types);
+         for (int j = 0; j < fn->num_subroutine_types; j++)
+            entry->types[j] = fn->subroutine_types[j];
+         sh->NumSubroutineFunctions++;
+      }
+   }
+}
 
 void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
@@ -2804,7 +3460,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    prog->Version = max_version;
    prog->IsES = is_es_prog;
 
-   /* Geometry shaders have to be linked with vertex shaders.
+   /* Some shaders have to be linked with some other shaders present.
     */
    if (num_shaders[MESA_SHADER_GEOMETRY] > 0 &&
        num_shaders[MESA_SHADER_VERTEX] == 0 &&
@@ -2813,6 +3469,44 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 		   "vertex shader\n");
       goto done;
    }
+   if (num_shaders[MESA_SHADER_TESS_EVAL] > 0 &&
+       num_shaders[MESA_SHADER_VERTEX] == 0 &&
+       !prog->SeparateShader) {
+      linker_error(prog, "Tessellation evaluation shader must be linked with "
+		   "vertex shader\n");
+      goto done;
+   }
+   if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
+       num_shaders[MESA_SHADER_VERTEX] == 0 &&
+       !prog->SeparateShader) {
+      linker_error(prog, "Tessellation control shader must be linked with "
+		   "vertex shader\n");
+      goto done;
+   }
+
+   /* The spec is self-contradictory here. It allows linking without a tess
+    * eval shader, but that can only be used with transform feedback and
+    * rasterization disabled. However, transform feedback isn't allowed
+    * with GL_PATCHES, so it can't be used.
+    *
+    * More investigation showed that the idea of transform feedback after
+    * a tess control shader was dropped, because some hw vendors couldn't
+    * support tessellation without a tess eval shader, but the linker section
+    * wasn't updated to reflect that.
+    *
+    * All specifications (ARB_tessellation_shader, GL 4.0-4.5) have this
+    * spec bug.
+    *
+    * Do what's reasonable and always require a tess eval shader if a tess
+    * control shader is present.
+    */
+   if (num_shaders[MESA_SHADER_TESS_CTRL] > 0 &&
+       num_shaders[MESA_SHADER_TESS_EVAL] == 0 &&
+       !prog->SeparateShader) {
+      linker_error(prog, "Tessellation control shader must be linked with "
+		   "tessellation evaluation shader\n");
+      goto done;
+   }
 
    /* Compute shaders have additional restrictions. */
    if (num_shaders[MESA_SHADER_COMPUTE] > 0 &&
@@ -2846,6 +3540,12 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
          case MESA_SHADER_VERTEX:
             validate_vertex_shader_executable(prog, sh);
             break;
+         case MESA_SHADER_TESS_CTRL:
+            /* nothing to be done */
+            break;
+         case MESA_SHADER_TESS_EVAL:
+            validate_tess_eval_shader_executable(prog, sh);
+            break;
          case MESA_SHADER_GEOMETRY:
             validate_geometry_shader_executable(prog, sh);
             break;
@@ -2865,6 +3565,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 
    if (num_shaders[MESA_SHADER_GEOMETRY] > 0)
       prog->LastClipDistanceArraySize = prog->Geom.ClipDistanceArraySize;
+   else if (num_shaders[MESA_SHADER_TESS_EVAL] > 0)
+      prog->LastClipDistanceArraySize = prog->TessEval.ClipDistanceArraySize;
    else if (num_shaders[MESA_SHADER_VERTEX] > 0)
       prog->LastClipDistanceArraySize = prog->Vert.ClipDistanceArraySize;
    else
@@ -2886,9 +3588,13 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    }
 
    check_explicit_uniform_locations(ctx, prog);
+   link_assign_subroutine_types(ctx, prog);
+
    if (!prog->LinkStatus)
       goto done;
 
+   resize_tes_inputs(ctx, prog);
+
    /* Validate the inputs of each stage with the output of the preceding
     * stage.
     */
@@ -2953,6 +3659,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
          lower_clip_distance(prog->_LinkedShaders[i]);
       }
 
+      if (ctx->Const.LowerTessLevel) {
+         lower_tess_level(prog->_LinkedShaders[i]);
+      }
+
       while (do_common_optimization(prog->_LinkedShaders[i]->ir, true, false,
                                     &ctx->Const.ShaderCompilerOptions[i],
                                     ctx->Const.NativeIntegers))
@@ -2961,6 +3671,16 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       lower_const_arrays_to_uniforms(prog->_LinkedShaders[i]->ir);
    }
 
+   /* Validation for special cases where we allow sampler array indexing
+    * with a loop induction variable. This check emits a warning or an error
+    * depending on whether the backend can handle dynamic indexing.
+    */
+   if ((!prog->IsES && prog->Version < 130) ||
+       (prog->IsES && prog->Version < 300)) {
+      if (!validate_sampler_array_indexing(ctx, prog))
+         goto done;
+   }
+
    /* Check and validate stream emissions in geometry shaders */
    validate_geometry_shader_emissions(ctx, prog);
 
@@ -2971,16 +3691,13 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
-   /* FINISHME: The value of the max_attribute_index parameter is
-    * FINISHME: implementation dependent based on the value of
-    * FINISHME: GL_MAX_VERTEX_ATTRIBS.  GL_MAX_VERTEX_ATTRIBS must be
-    * FINISHME: at least 16, so hardcode 16 for now.
-    */
-   if (!assign_attribute_or_color_locations(prog, MESA_SHADER_VERTEX, 16)) {
+   if (!assign_attribute_or_color_locations(prog, &ctx->Const,
+                                            MESA_SHADER_VERTEX)) {
       goto done;
    }
 
-   if (!assign_attribute_or_color_locations(prog, MESA_SHADER_FRAGMENT, MAX2(ctx->Const.MaxDrawBuffers, ctx->Const.MaxDualSourceDrawBuffers))) {
+   if (!assign_attribute_or_color_locations(prog, &ctx->Const,
+                                            MESA_SHADER_FRAGMENT)) {
       goto done;
    }
 
@@ -3039,8 +3756,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
           */
          if (!assign_varying_locations(ctx, mem_ctx, prog,
                                        NULL, prog->_LinkedShaders[first],
-                                       num_tfeedback_decls, tfeedback_decls,
-                                       prog->Geom.VerticesIn))
+                                       num_tfeedback_decls, tfeedback_decls))
             goto done;
       }
 
@@ -3051,8 +3767,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
           */
          if (!assign_varying_locations(ctx, mem_ctx, prog,
                                        sh, NULL,
-                                       num_tfeedback_decls, tfeedback_decls,
-                                       0))
+                                       num_tfeedback_decls, tfeedback_decls))
             goto done;
       }
 
@@ -3080,8 +3795,7 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
                                        NULL /* producer */,
                                        sh /* consumer */,
                                        0 /* num_tfeedback_decls */,
-                                       NULL /* tfeedback_decls */,
-                                       0 /* gs_input_vertices */))
+                                       NULL /* tfeedback_decls */))
             goto done;
       } else
          demote_shader_inputs_and_outputs(sh, ir_var_shader_in);
@@ -3097,12 +3811,10 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 
       gl_shader *const sh_i = prog->_LinkedShaders[i];
       gl_shader *const sh_next = prog->_LinkedShaders[next];
-      unsigned gs_input_vertices =
-         next == MESA_SHADER_GEOMETRY ? prog->Geom.VerticesIn : 0;
 
       if (!assign_varying_locations(ctx, mem_ctx, prog, sh_i, sh_next,
                 next == MESA_SHADER_FRAGMENT ? num_tfeedback_decls : 0,
-                tfeedback_decls, gs_input_vertices))
+                tfeedback_decls))
          goto done;
 
       do_dead_builtin_varyings(ctx, sh_i, sh_next,
@@ -3136,7 +3848,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    link_assign_atomic_counter_resources(ctx, prog);
    store_fragdepth_layout(prog);
 
+   link_calculate_subroutine_compat(ctx, prog);
    check_resources(ctx, prog);
+   check_subroutine_resources(ctx, prog);
    check_image_resources(ctx, prog);
    link_check_atomic_counter_resources(ctx, prog);
 
@@ -3157,10 +3871,6 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
-   build_program_resource_list(ctx, prog);
-   if (!prog->LinkStatus)
-      goto done;
-
    /* FINISHME: Assign fragment shader output locations. */
 
 done:
diff --git a/src/glsl/loop_unroll.cpp b/src/glsl/loop_unroll.cpp
index 635e1dd99cd..b9ea3507782 100644
--- a/src/glsl/loop_unroll.cpp
+++ b/src/glsl/loop_unroll.cpp
@@ -100,6 +100,18 @@ public:
 
    virtual ir_visitor_status visit_enter(ir_dereference_array *ir)
    {
+      /* Force unroll in case of dynamic indexing with sampler arrays
+       * when EmitNoIndirectSampler is set.
+       */
+      if (options->EmitNoIndirectSampler) {
+         if ((ir->array->type->is_array() &&
+              ir->array->type->contains_sampler()) &&
+             !ir->array_index->constant_expression_value()) {
+            unsupported_variable_indexing = true;
+            return visit_continue;
+         }
+      }
+
       /* Check for arrays variably-indexed by a loop induction variable.
        * Unrolling the loop may convert that access into constant-indexing.
        *
@@ -133,6 +145,7 @@ public:
                   unsupported_variable_indexing = true;
                break;
             case ir_var_uniform:
+            case ir_var_shader_storage:
                if (options->EmitNoIndirectUniform)
                   unsupported_variable_indexing = true;
                break;
diff --git a/src/glsl/lower_clip_distance.cpp b/src/glsl/lower_clip_distance.cpp
index 01f028b1f37..1ada215796c 100644
--- a/src/glsl/lower_clip_distance.cpp
+++ b/src/glsl/lower_clip_distance.cpp
@@ -55,9 +55,9 @@ namespace {
 class lower_clip_distance_visitor : public ir_rvalue_visitor {
 public:
    explicit lower_clip_distance_visitor(gl_shader_stage shader_stage)
-      : progress(false), old_clip_distance_1d_var(NULL),
-        old_clip_distance_2d_var(NULL), new_clip_distance_1d_var(NULL),
-        new_clip_distance_2d_var(NULL), shader_stage(shader_stage)
+      : progress(false), old_clip_distance_out_var(NULL),
+        old_clip_distance_in_var(NULL), new_clip_distance_out_var(NULL),
+        new_clip_distance_in_var(NULL), shader_stage(shader_stage)
    {
    }
 
@@ -80,20 +80,21 @@ public:
     *
     * Note:
     *
-    * - the 2d_var is for geometry shader input only.
+    * - the in_var is for geometry and both tessellation shader inputs only.
     *
-    * - since gl_ClipDistance is available in geometry shaders as both an
-    *   input and an output, it's possible for both old_clip_distance_1d_var
-    *   and old_clip_distance_2d_var to be non-null.
+    * - since gl_ClipDistance is available in tessellation control,
+    *   tessellation evaluation and geometry shaders as both an input
+    *   and an output, it's possible for both old_clip_distance_out_var
+    *   and old_clip_distance_in_var to be non-null.
     */
-   ir_variable *old_clip_distance_1d_var;
-   ir_variable *old_clip_distance_2d_var;
+   ir_variable *old_clip_distance_out_var;
+   ir_variable *old_clip_distance_in_var;
 
    /**
     * Pointer to the newly-created gl_ClipDistanceMESA variable.
     */
-   ir_variable *new_clip_distance_1d_var;
-   ir_variable *new_clip_distance_2d_var;
+   ir_variable *new_clip_distance_out_var;
+   ir_variable *new_clip_distance_in_var;
 
    /**
     * Type of shader we are compiling (e.g. MESA_SHADER_VERTEX)
@@ -110,62 +111,81 @@ public:
 ir_visitor_status
 lower_clip_distance_visitor::visit(ir_variable *ir)
 {
+   ir_variable **old_var;
+   ir_variable **new_var;
+
    if (!ir->name || strcmp(ir->name, "gl_ClipDistance") != 0)
       return visit_continue;
    assert (ir->type->is_array());
 
-   if (!ir->type->fields.array->is_array()) {
-      /* 1D gl_ClipDistance (used for vertex and geometry output, and fragment
-       * input).
-       */
-      if (this->old_clip_distance_1d_var)
+   if (ir->data.mode == ir_var_shader_out) {
+      if (this->old_clip_distance_out_var)
          return visit_continue;
+      old_var = &old_clip_distance_out_var;
+      new_var = &new_clip_distance_out_var;
+   } else if (ir->data.mode == ir_var_shader_in) {
+      if (this->old_clip_distance_in_var)
+         return visit_continue;
+      old_var = &old_clip_distance_in_var;
+      new_var = &new_clip_distance_in_var;
+   } else {
+      unreachable("not reached");
+   }
 
-      this->progress = true;
-      this->old_clip_distance_1d_var = ir;
+   this->progress = true;
+
+   if (!ir->type->fields.array->is_array()) {
+      /* gl_ClipDistance (used for vertex, tessellation evaluation and
+       * geometry output, and fragment input).
+       */
+      assert((ir->data.mode == ir_var_shader_in &&
+              this->shader_stage == MESA_SHADER_FRAGMENT) ||
+             (ir->data.mode == ir_var_shader_out &&
+              (this->shader_stage == MESA_SHADER_VERTEX ||
+               this->shader_stage == MESA_SHADER_TESS_EVAL ||
+               this->shader_stage == MESA_SHADER_GEOMETRY)));
+
+      *old_var = ir;
       assert (ir->type->fields.array == glsl_type::float_type);
       unsigned new_size = (ir->type->array_size() + 3) / 4;
 
       /* Clone the old var so that we inherit all of its properties */
-      this->new_clip_distance_1d_var = ir->clone(ralloc_parent(ir), NULL);
+      *new_var = ir->clone(ralloc_parent(ir), NULL);
 
       /* And change the properties that we need to change */
-      this->new_clip_distance_1d_var->name
-         = ralloc_strdup(this->new_clip_distance_1d_var,
-                         "gl_ClipDistanceMESA");
-      this->new_clip_distance_1d_var->type
-         = glsl_type::get_array_instance(glsl_type::vec4_type, new_size);
-      this->new_clip_distance_1d_var->data.max_array_access
-         = ir->data.max_array_access / 4;
+      (*new_var)->name = ralloc_strdup(*new_var, "gl_ClipDistanceMESA");
+      (*new_var)->type = glsl_type::get_array_instance(glsl_type::vec4_type,
+                                                       new_size);
+      (*new_var)->data.max_array_access = ir->data.max_array_access / 4;
 
-      ir->replace_with(this->new_clip_distance_1d_var);
+      ir->replace_with(*new_var);
    } else {
-      /* 2D gl_ClipDistance (used for geometry input). */
-      assert(ir->data.mode == ir_var_shader_in &&
-             this->shader_stage == MESA_SHADER_GEOMETRY);
-      if (this->old_clip_distance_2d_var)
-         return visit_continue;
+      /* 2D gl_ClipDistance (used for tessellation control, tessellation
+       * evaluation and geometry input, and tessellation control output).
+       */
+      assert((ir->data.mode == ir_var_shader_in &&
+              (this->shader_stage == MESA_SHADER_GEOMETRY ||
+               this->shader_stage == MESA_SHADER_TESS_EVAL)) ||
+             this->shader_stage == MESA_SHADER_TESS_CTRL);
 
-      this->progress = true;
-      this->old_clip_distance_2d_var = ir;
+      *old_var = ir;
       assert (ir->type->fields.array->fields.array == glsl_type::float_type);
       unsigned new_size = (ir->type->fields.array->array_size() + 3) / 4;
 
       /* Clone the old var so that we inherit all of its properties */
-      this->new_clip_distance_2d_var = ir->clone(ralloc_parent(ir), NULL);
+      *new_var = ir->clone(ralloc_parent(ir), NULL);
 
       /* And change the properties that we need to change */
-      this->new_clip_distance_2d_var->name
-         = ralloc_strdup(this->new_clip_distance_2d_var, "gl_ClipDistanceMESA");
-      this->new_clip_distance_2d_var->type = glsl_type::get_array_instance(
+      (*new_var)->name = ralloc_strdup(*new_var, "gl_ClipDistanceMESA");
+      (*new_var)->type = glsl_type::get_array_instance(
          glsl_type::get_array_instance(glsl_type::vec4_type,
             new_size),
          ir->type->array_size());
-      this->new_clip_distance_2d_var->data.max_array_access
-         = ir->data.max_array_access / 4;
+      (*new_var)->data.max_array_access = ir->data.max_array_access / 4;
 
-      ir->replace_with(this->new_clip_distance_2d_var);
+      ir->replace_with(*new_var);
    }
+
    return visit_continue;
 }
 
@@ -242,26 +262,27 @@ lower_clip_distance_visitor::is_clip_distance_vec8(ir_rvalue *ir)
 {
    /* Note that geometry shaders contain gl_ClipDistance both as an input
     * (which is a 2D array) and an output (which is a 1D array), so it's
-    * possible for both this->old_clip_distance_1d_var and
-    * this->old_clip_distance_2d_var to be non-NULL in the same shader.
+    * possible for both this->old_clip_distance_out_var and
+    * this->old_clip_distance_in_var to be non-NULL in the same shader.
     */
 
-   if (this->old_clip_distance_1d_var) {
-      ir_dereference_variable *var_ref = ir->as_dereference_variable();
-      if (var_ref && var_ref->var == this->old_clip_distance_1d_var)
+   if (!ir->type->is_array())
+      return false;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return false;
+
+   if (this->old_clip_distance_out_var) {
+      if (ir->variable_referenced() == this->old_clip_distance_out_var)
          return true;
    }
-   if (this->old_clip_distance_2d_var) {
-      /* 2D clip distance is only possible as a geometry input */
-      assert(this->shader_stage == MESA_SHADER_GEOMETRY);
+   if (this->old_clip_distance_in_var) {
+      assert(this->shader_stage == MESA_SHADER_TESS_CTRL ||
+             this->shader_stage == MESA_SHADER_TESS_EVAL ||
+             this->shader_stage == MESA_SHADER_GEOMETRY ||
+             this->shader_stage == MESA_SHADER_FRAGMENT);
 
-      ir_dereference_array *array_ref = ir->as_dereference_array();
-      if (array_ref) {
-         ir_dereference_variable *var_ref =
-            array_ref->array->as_dereference_variable();
-         if (var_ref && var_ref->var == this->old_clip_distance_2d_var)
-            return true;
-      }
+      if (ir->variable_referenced() == this->old_clip_distance_in_var)
+         return true;
    }
    return false;
 }
@@ -279,29 +300,33 @@ lower_clip_distance_visitor::is_clip_distance_vec8(ir_rvalue *ir)
 ir_rvalue *
 lower_clip_distance_visitor::lower_clip_distance_vec8(ir_rvalue *ir)
 {
-   if (this->old_clip_distance_1d_var) {
-      ir_dereference_variable *var_ref = ir->as_dereference_variable();
-      if (var_ref && var_ref->var == this->old_clip_distance_1d_var) {
-         return new(ralloc_parent(ir))
-            ir_dereference_variable(this->new_clip_distance_1d_var);
-      }
-   }
-   if (this->old_clip_distance_2d_var) {
-      /* 2D clip distance is only possible as a geometry input */
-      assert(this->shader_stage == MESA_SHADER_GEOMETRY);
+   if (!ir->type->is_array())
+      return NULL;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return NULL;
 
-      ir_dereference_array *array_ref = ir->as_dereference_array();
-      if (array_ref) {
-         ir_dereference_variable *var_ref =
-            array_ref->array->as_dereference_variable();
-         if (var_ref && var_ref->var == this->old_clip_distance_2d_var) {
-            return new(ralloc_parent(ir))
-               ir_dereference_array(this->new_clip_distance_2d_var,
-                                    array_ref->array_index);
-         }
-      }
+   ir_variable **new_var = NULL;
+   if (this->old_clip_distance_out_var) {
+      if (ir->variable_referenced() == this->old_clip_distance_out_var)
+         new_var = &this->new_clip_distance_out_var;
+   }
+   if (this->old_clip_distance_in_var) {
+      if (ir->variable_referenced() == this->old_clip_distance_in_var)
+         new_var = &this->new_clip_distance_in_var;
+   }
+   if (new_var == NULL)
+      return NULL;
+
+   if (ir->as_dereference_variable()) {
+      return new(ralloc_parent(ir)) ir_dereference_variable(*new_var);
+   } else {
+      ir_dereference_array *array_ref = ir->as_dereference_array();
+      assert(array_ref);
+      assert(array_ref->array->as_dereference_variable());
+
+      return new(ralloc_parent(ir))
+         ir_dereference_array(*new_var, array_ref->array_index);
    }
-   return NULL;
 }
 
 
@@ -540,10 +565,10 @@ lower_clip_distance(gl_shader *shader)
 
    visit_list_elements(&v, shader->ir);
 
-   if (v.new_clip_distance_1d_var)
-      shader->symbols->add_variable(v.new_clip_distance_1d_var);
-   if (v.new_clip_distance_2d_var)
-      shader->symbols->add_variable(v.new_clip_distance_2d_var);
+   if (v.new_clip_distance_out_var)
+      shader->symbols->add_variable(v.new_clip_distance_out_var);
+   if (v.new_clip_distance_in_var)
+      shader->symbols->add_variable(v.new_clip_distance_in_var);
 
    return v.progress;
 }
diff --git a/src/glsl/lower_named_interface_blocks.cpp b/src/glsl/lower_named_interface_blocks.cpp
index 7304c51399a..01bbdd0587e 100644
--- a/src/glsl/lower_named_interface_blocks.cpp
+++ b/src/glsl/lower_named_interface_blocks.cpp
@@ -108,7 +108,8 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
        * but, this will require changes to the other uniform block
        * support code.
        */
-      if (var->data.mode == ir_var_uniform)
+      if (var->data.mode == ir_var_uniform ||
+          var->data.mode == ir_var_shader_storage)
          continue;
 
       const glsl_type * iface_t = var->type;
@@ -125,7 +126,8 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
       for (unsigned i = 0; i < iface_t->length; i++) {
          const char * field_name = iface_t->fields.structure[i].name;
          char *iface_field_name =
-            ralloc_asprintf(mem_ctx, "%s.%s.%s",
+            ralloc_asprintf(mem_ctx, "%s %s.%s.%s",
+                            var->data.mode == ir_var_shader_in ? "in" : "out",
                             iface_t->name, var->name, field_name);
 
          ir_variable *found_var =
@@ -158,6 +160,7 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                iface_t->fields.structure[i].interpolation;
             new_var->data.centroid = iface_t->fields.structure[i].centroid;
             new_var->data.sample = iface_t->fields.structure[i].sample;
+            new_var->data.patch = iface_t->fields.structure[i].patch;
 
             new_var->init_interface_type(iface_t);
             hash_table_insert(interface_namespace, new_var,
@@ -212,12 +215,14 @@ flatten_named_interface_blocks_declarations::handle_rvalue(ir_rvalue **rvalue)
     * but, this will require changes to the other uniform block
     * support code.
     */
-   if (var->data.mode == ir_var_uniform)
+   if (var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage)
       return;
 
    if (var->get_interface_type() != NULL) {
       char *iface_field_name =
-         ralloc_asprintf(mem_ctx, "%s.%s.%s", var->get_interface_type()->name,
+         ralloc_asprintf(mem_ctx, "%s %s.%s.%s",
+                         var->data.mode == ir_var_shader_in ? "in" : "out",
+                         var->get_interface_type()->name,
                          var->name, ir->field);
       /* Find the variable in the set of flattened interface blocks */
       ir_variable *found_var =
diff --git a/src/glsl/lower_output_reads.cpp b/src/glsl/lower_output_reads.cpp
index 1ee815d5ece..79488df2932 100644
--- a/src/glsl/lower_output_reads.cpp
+++ b/src/glsl/lower_output_reads.cpp
@@ -48,8 +48,10 @@ protected:
    hash_table *replacements;
 
    void *mem_ctx;
+
+   unsigned stage;
 public:
-   output_read_remover();
+   output_read_remover(unsigned stage);
    ~output_read_remover();
    virtual ir_visitor_status visit(class ir_dereference_variable *);
    virtual ir_visitor_status visit_leave(class ir_emit_vertex *);
@@ -75,8 +77,9 @@ hash_table_var_hash(const void *key)
    return hash_table_string_hash(var->name);
 }
 
-output_read_remover::output_read_remover()
+output_read_remover::output_read_remover(unsigned stage)
 {
+   this->stage = stage;
    mem_ctx = ralloc_context(NULL);
    replacements =
       hash_table_ctor(0, hash_table_var_hash, hash_table_pointer_compare);
@@ -93,6 +96,8 @@ output_read_remover::visit(ir_dereference_variable *ir)
 {
    if (ir->var->data.mode != ir_var_shader_out)
       return visit_continue;
+   if (stage == MESA_SHADER_TESS_CTRL)
+      return visit_continue;
 
    ir_variable *temp = (ir_variable *) hash_table_find(replacements, ir->var);
 
@@ -166,8 +171,8 @@ output_read_remover::visit_leave(ir_function_signature *sig)
 }
 
 void
-lower_output_reads(exec_list *instructions)
+lower_output_reads(unsigned stage, exec_list *instructions)
 {
-   output_read_remover v;
+   output_read_remover v(stage);
    visit_list_elements(&v, instructions);
 }
diff --git a/src/glsl/lower_packed_varyings.cpp b/src/glsl/lower_packed_varyings.cpp
index d8bebb52235..cfe414ae088 100644
--- a/src/glsl/lower_packed_varyings.cpp
+++ b/src/glsl/lower_packed_varyings.cpp
@@ -610,6 +610,7 @@ lower_packed_varyings_visitor::get_packed_varying_deref(
       }
       packed_var->data.centroid = unpacked_var->data.centroid;
       packed_var->data.sample = unpacked_var->data.sample;
+      packed_var->data.patch = unpacked_var->data.patch;
       packed_var->data.interpolation = unpacked_var->data.interpolation;
       packed_var->data.location = location;
       unpacked_var->insert_before(packed_var);
diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp
new file mode 100644
index 00000000000..b29912ad150
--- /dev/null
+++ b/src/glsl/lower_subroutine.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2015 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file lower_subroutine.cpp
+ *
+ * lowers subroutines to an if ladder.
+ */
+
+#include "glsl_types.h"
+#include "glsl_parser_extras.h"
+#include "ir.h"
+#include "ir_builder.h"
+
+using namespace ir_builder;
+namespace {
+
+class lower_subroutine_visitor : public ir_hierarchical_visitor {
+public:
+   lower_subroutine_visitor(struct _mesa_glsl_parse_state *state)
+      : state(state)
+   {
+      this->progress = false;
+   }
+   /* Rewrites calls that go through a subroutine variable (ir->sub_var). */
+   ir_visitor_status visit_leave(ir_call *);
+   bool progress; /* value returned by lower_subroutine() */
+   struct _mesa_glsl_parse_state *state; /* supplies subroutines[] */
+};
+
+}
+
+bool
+lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state)
+{
+   lower_subroutine_visitor v(state);
+   visit_list_elements(&v, instructions);
+   return v.progress;
+}
+
+/* Lower a call made through a subroutine variable into an if ladder of
+ * direct calls, one per compatible subroutine, keyed on the variable. */
+ir_visitor_status
+lower_subroutine_visitor::visit_leave(ir_call *ir)
+{
+   if (!ir->sub_var)
+      return visit_continue;
+   void *mem_ctx = ralloc_parent(ir);
+   ir_if *last_branch = NULL;
+   ir_dereference_variable *return_deref = ir->return_deref;
+   /* Walk downwards so the lowest compatible index is the outermost test. */
+   for (int s = this->state->num_subroutines - 1; s >= 0; s--) {
+      ir_rvalue *var;
+      ir_constant *lc = new(mem_ctx)ir_constant(s);
+      ir_function *fn = this->state->subroutines[s];
+      bool is_compat = false;
+      /* Only functions listing the variable's subroutine type qualify. */
+      for (int i = 0; i < fn->num_subroutine_types; i++) {
+         if (ir->sub_var->type->without_array() == fn->subroutine_types[i]) {
+            is_compat = true;
+            break;
+         }
+      }
+      if (is_compat == false)
+         continue;
+      /* Fresh dereference of the subroutine variable for this comparison. */
+      if (ir->array_idx != NULL)
+         var = new(mem_ctx) ir_dereference_array(ir->sub_var, ir->array_idx->clone(mem_ctx, NULL));
+      else
+         var = new(mem_ctx) ir_dereference_variable(ir->sub_var);
+
+      ir_function_signature *sub_sig =
+         fn->exact_matching_signature(this->state,
+                                      &ir->actual_parameters);
+      ir_call *new_call = new(mem_ctx) ir_call(sub_sig, return_deref, &ir->actual_parameters);
+      if (!last_branch)
+         last_branch = if_tree(equal(subr_to_int(var), lc), new_call);
+      else
+         last_branch = if_tree(equal(subr_to_int(var), lc), new_call, last_branch);
+      /* Next call gets its own deref; return_deref may be NULL (no result). */
+      if (return_deref && s > 0)
+        return_deref = return_deref->clone(mem_ctx, NULL);
+   }
+   if (last_branch)
+      ir->insert_before(last_branch);
+   ir->remove();
+   this->progress = true;
+   return visit_continue;
+}
diff --git a/src/glsl/lower_tess_level.cpp b/src/glsl/lower_tess_level.cpp
new file mode 100644
index 00000000000..bed2553222f
--- /dev/null
+++ b/src/glsl/lower_tess_level.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file lower_tess_level.cpp
+ *
+ * This pass accounts for the difference between the way gl_TessLevelOuter
+ * and gl_TessLevelInner is declared in standard GLSL (as an array of
+ * floats), and the way it is frequently implemented in hardware (as a vec4
+ * and vec2).
+ *
+ * The declaration of gl_TessLevel* is replaced with a declaration
+ * of gl_TessLevel*MESA, and any references to gl_TessLevel* are
+ * translated to refer to gl_TessLevel*MESA with the appropriate
+ * swizzling of array indices.  For instance:
+ *
+ *   gl_TessLevelOuter[i]
+ *
+ * is translated into:
+ *
+ *   gl_TessLevelOuterMESA[i]
+ *
+ * Since some hardware may not internally represent gl_TessLevel* as a vec4
+ * and a vec2, this lowering pass is optional.  To enable it, set the
+ * LowerTessLevel flag in gl_shader_compiler_options to true.
+ */
+
+#include "glsl_symbol_table.h"
+#include "ir_rvalue_visitor.h"
+#include "ir.h"
+#include "program/prog_instruction.h" /* For WRITEMASK_* */
+
+namespace {
+
+class lower_tess_level_visitor : public ir_rvalue_visitor {
+public:
+   explicit lower_tess_level_visitor(gl_shader_stage shader_stage)
+      : progress(false), old_tess_level_outer_var(NULL),
+        old_tess_level_inner_var(NULL), new_tess_level_outer_var(NULL),
+        new_tess_level_inner_var(NULL), shader_stage(shader_stage)
+   {
+   }
+
+   virtual ir_visitor_status visit(ir_variable *);
+   bool is_tess_level_array(ir_rvalue *ir);
+   ir_rvalue *lower_tess_level_array(ir_rvalue *ir);
+   virtual ir_visitor_status visit_leave(ir_assignment *);
+   void visit_new_assignment(ir_assignment *ir);
+   virtual ir_visitor_status visit_leave(ir_call *);
+
+   virtual void handle_rvalue(ir_rvalue **rvalue);
+
+   void fix_lhs(ir_assignment *);
+
+   bool progress;
+
+   /**
+    * Pointer to the declaration of gl_TessLevel*, if found.
+    */
+   ir_variable *old_tess_level_outer_var;
+   ir_variable *old_tess_level_inner_var;
+
+   /**
+    * Pointer to the newly-created gl_TessLevel*MESA variables.
+    */
+   ir_variable *new_tess_level_outer_var;
+   ir_variable *new_tess_level_inner_var;
+
+   /**
+    * Type of shader we are compiling (e.g. MESA_SHADER_TESS_CTRL)
+    */
+   const gl_shader_stage shader_stage;
+};
+
+} /* anonymous namespace */
+
+/**
+ * Replace any declaration of gl_TessLevel* as an array of floats with a
+ * declaration of gl_TessLevel*MESA as a vec4 (outer) or vec2 (inner).
+ */
+ir_visitor_status
+lower_tess_level_visitor::visit(ir_variable *ir)
+{
+   if ((!ir->name) ||
+       ((strcmp(ir->name, "gl_TessLevelInner") != 0) &&
+        (strcmp(ir->name, "gl_TessLevelOuter") != 0)))
+      return visit_continue;
+
+   assert (ir->type->is_array());
+
+   if (strcmp(ir->name, "gl_TessLevelOuter") == 0) {
+      if (this->old_tess_level_outer_var)
+         return visit_continue;
+
+      old_tess_level_outer_var = ir;
+      assert(ir->type->fields.array == glsl_type::float_type);
+
+      /* Clone the old var so that we inherit all of its properties */
+      new_tess_level_outer_var = ir->clone(ralloc_parent(ir), NULL);
+
+      /* And change the properties that we need to change */
+      new_tess_level_outer_var->name = ralloc_strdup(new_tess_level_outer_var,
+                                                "gl_TessLevelOuterMESA");
+      new_tess_level_outer_var->type = glsl_type::vec4_type;
+      new_tess_level_outer_var->data.max_array_access = 0;
+
+      ir->replace_with(new_tess_level_outer_var);
+   } else if (strcmp(ir->name, "gl_TessLevelInner") == 0) {
+      if (this->old_tess_level_inner_var)
+         return visit_continue;
+
+      old_tess_level_inner_var = ir;
+      assert(ir->type->fields.array == glsl_type::float_type);
+
+      /* Clone the old var so that we inherit all of its properties */
+      new_tess_level_inner_var = ir->clone(ralloc_parent(ir), NULL);
+
+      /* And change the properties that we need to change */
+      new_tess_level_inner_var->name = ralloc_strdup(new_tess_level_inner_var,
+                                                "gl_TessLevelInnerMESA");
+      new_tess_level_inner_var->type = glsl_type::vec2_type;
+      new_tess_level_inner_var->data.max_array_access = 0;
+
+      ir->replace_with(new_tess_level_inner_var);
+   } else {
+      assert(0);
+   }
+
+   this->progress = true;
+
+   return visit_continue;
+}
+
+
+/**
+ * Determine whether the given rvalue describes an array of floats that
+ * needs to be lowered to a vec4 or vec2; that is, determine whether it
+ * matches one of the following patterns:
+ *
+ * - gl_TessLevelOuter
+ * - gl_TessLevelInner
+ */
+bool
+lower_tess_level_visitor::is_tess_level_array(ir_rvalue *ir)
+{
+   if (!ir->type->is_array())
+      return false;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return false;
+
+   if (this->old_tess_level_outer_var) {
+      if (ir->variable_referenced() == this->old_tess_level_outer_var)
+         return true;
+   }
+   if (this->old_tess_level_inner_var) {
+      if (ir->variable_referenced() == this->old_tess_level_inner_var)
+         return true;
+   }
+   return false;
+}
+
+
+/**
+ * If the given ir satisfies is_tess_level_array(), return new ir
+ * representing its lowered equivalent.  That is, map:
+ *
+ * - gl_TessLevelOuter => gl_TessLevelOuterMESA
+ * - gl_TessLevelInner => gl_TessLevelInnerMESA
+ *
+ * Otherwise return NULL.
+ */
+ir_rvalue *
+lower_tess_level_visitor::lower_tess_level_array(ir_rvalue *ir)
+{
+   if (!ir->type->is_array())
+      return NULL;
+   if (ir->type->fields.array != glsl_type::float_type)
+      return NULL;
+
+   ir_variable **new_var = NULL;
+
+   if (this->old_tess_level_outer_var) {
+      if (ir->variable_referenced() == this->old_tess_level_outer_var)
+         new_var = &this->new_tess_level_outer_var;
+   }
+   if (this->old_tess_level_inner_var) {
+      if (ir->variable_referenced() == this->old_tess_level_inner_var)
+         new_var = &this->new_tess_level_inner_var;
+   }
+
+   if (new_var == NULL)
+      return NULL;
+
+   assert(ir->as_dereference_variable());
+   return new(ralloc_parent(ir)) ir_dereference_variable(*new_var);
+}
+
+
+void
+lower_tess_level_visitor::handle_rvalue(ir_rvalue **rv)
+{
+   if (*rv == NULL)
+      return;
+
+   ir_dereference_array *const array_deref = (*rv)->as_dereference_array();
+   if (array_deref == NULL)
+      return;
+
+   /* Replace any expression that indexes one of the floats in gl_TessLevel*
+    * with a vector_extract of the matching component from the
+    * corresponding gl_TessLevel*MESA vector (vec4 or vec2).
+    */
+   ir_rvalue *lowered_vec4 =
+      this->lower_tess_level_array(array_deref->array);
+   if (lowered_vec4 != NULL) {
+      this->progress = true;
+      void *mem_ctx = ralloc_parent(array_deref);
+
+      ir_expression *const expr =
+         new(mem_ctx) ir_expression(ir_binop_vector_extract,
+                                    lowered_vec4,
+                                    array_deref->array_index);
+
+      *rv = expr;
+   }
+}
+
+void
+lower_tess_level_visitor::fix_lhs(ir_assignment *ir)
+{
+   if (ir->lhs->ir_type != ir_type_expression)
+      return;
+   void *mem_ctx = ralloc_parent(ir);
+   ir_expression *const expr = (ir_expression *) ir->lhs;
+
+   /* The expression must be of the form:
+    *
+    *     (vector_extract gl_TessLevel*MESA, j).
+    */
+   assert(expr->operation == ir_binop_vector_extract);
+   assert(expr->operands[0]->ir_type == ir_type_dereference_variable);
+   assert((expr->operands[0]->type == glsl_type::vec4_type) ||
+          (expr->operands[0]->type == glsl_type::vec2_type));
+
+   ir_dereference *const new_lhs = (ir_dereference *) expr->operands[0];
+
+   ir_constant *old_index_constant = expr->operands[1]->constant_expression_value();
+   if (!old_index_constant) {
+      ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert,
+                                           expr->operands[0]->type,
+                                           new_lhs->clone(mem_ctx, NULL),
+                                           ir->rhs,
+                                           expr->operands[1]);
+   }
+   ir->set_lhs(new_lhs);
+
+   if (old_index_constant) {
+      /* gl_TessLevel* is being accessed via a constant index.  Don't bother
+       * creating a vector insert op. Just use a write mask.
+       */
+      ir->write_mask = 1 << old_index_constant->get_int_component(0);
+   } else {
+      ir->write_mask = (1 << expr->operands[0]->type->vector_elements) - 1;
+   }
+}
+
+/**
+ * Replace any assignment having a gl_TessLevel* (undereferenced) as
+ * its LHS or RHS with a sequence of assignments, one for each component of
+ * the array.  Each of these assignments is lowered to refer to
+ * gl_TessLevel*MESA as appropriate.
+ */
+ir_visitor_status
+lower_tess_level_visitor::visit_leave(ir_assignment *ir)
+{
+   /* First invoke the base class visitor.  This causes handle_rvalue() to be
+    * called on ir->rhs and ir->condition.
+    */
+   ir_rvalue_visitor::visit_leave(ir);
+
+   if (this->is_tess_level_array(ir->lhs) ||
+       this->is_tess_level_array(ir->rhs)) {
+      /* LHS or RHS of the assignment is the entire gl_TessLevel* array.
+       * Since we are reshaping gl_TessLevel* from an array of floats to a
+       * vec4 or vec2, this isn't going to work as a bulk assignment anymore,
+       * so unroll it to element-by-element assignments and lower each of
+       * them.
+       *
+       * Note: to unroll into element-by-element assignments, we need to make
+       * clones of the LHS and RHS.  This is safe because expressions and
+       * l-values are side-effect free.
+       */
+      void *ctx = ralloc_parent(ir);
+      int array_size = ir->lhs->type->array_size();
+      for (int i = 0; i < array_size; ++i) {
+         ir_dereference_array *new_lhs = new(ctx) ir_dereference_array(
+            ir->lhs->clone(ctx, NULL), new(ctx) ir_constant(i));
+         ir_dereference_array *new_rhs = new(ctx) ir_dereference_array(
+            ir->rhs->clone(ctx, NULL), new(ctx) ir_constant(i));
+         this->handle_rvalue((ir_rvalue **) &new_rhs);
+
+         /* Handle the LHS after creating the new assignment.  This must
+          * happen in this order because handle_rvalue may replace the old LHS
+          * with an ir_expression of ir_binop_vector_extract.  Since this is
+          * not a valid l-value, this will cause an assertion in the
+          * ir_assignment constructor to fail.
+          *
+          * If this occurs, replace the mangled LHS with a dereference of the
+          * vector, and replace the RHS with an ir_triop_vector_insert.
+          */
+         ir_assignment *const assign = new(ctx) ir_assignment(new_lhs, new_rhs);
+         this->handle_rvalue((ir_rvalue **) &assign->lhs);
+         this->fix_lhs(assign);
+
+         this->base_ir->insert_before(assign);
+      }
+      ir->remove();
+
+      return visit_continue;
+   }
+
+   /* Handle the LHS as if it were an r-value.  Normally
+    * rvalue_visit(ir_assignment *) only visits the RHS, but we need to lower
+    * expressions in the LHS as well.
+    *
+    * This may cause the LHS to get replaced with an ir_expression of
+    * ir_binop_vector_extract.  If this occurs, replace it with a dereference
+    * of the vector, and replace the RHS with an ir_triop_vector_insert.
+    */
+   handle_rvalue((ir_rvalue **)&ir->lhs);
+   this->fix_lhs(ir);
+
+   return rvalue_visit(ir);
+}
+
+
+/**
+ * Set up base_ir properly and call visit_leave() on a newly created
+ * ir_assignment node.  This is used in cases where we have to insert an
+ * ir_assignment in a place where we know the hierarchical visitor won't see
+ * it.
+ */
+void
+lower_tess_level_visitor::visit_new_assignment(ir_assignment *ir)
+{
+   ir_instruction *old_base_ir = this->base_ir;
+   this->base_ir = ir;
+   ir->accept(this);
+   this->base_ir = old_base_ir;
+}
+
+
+/**
+ * If a gl_TessLevel* variable appears as an argument in an ir_call
+ * expression, replace it with a temporary variable, and make sure the ir_call
+ * is preceded and/or followed by assignments that copy the contents of the
+ * temporary variable to and/or from gl_TessLevel*.  Each of these
+ * assignments is then lowered to refer to gl_TessLevel*MESA.
+ */
+ir_visitor_status
+lower_tess_level_visitor::visit_leave(ir_call *ir)
+{
+   void *ctx = ralloc_parent(ir);
+
+   const exec_node *formal_param_node = ir->callee->parameters.head;
+   const exec_node *actual_param_node = ir->actual_parameters.head;
+   while (!actual_param_node->is_tail_sentinel()) {
+      ir_variable *formal_param = (ir_variable *) formal_param_node;
+      ir_rvalue *actual_param = (ir_rvalue *) actual_param_node;
+
+      /* Advance formal_param_node and actual_param_node now so that we can
+       * safely replace actual_param with another node, if necessary, below.
+       */
+      formal_param_node = formal_param_node->next;
+      actual_param_node = actual_param_node->next;
+
+      if (!this->is_tess_level_array(actual_param))
+         continue;
+
+      /* User is trying to pass a whole gl_TessLevel* array to a function
+       * call.  Since we are reshaping gl_TessLevel* from an array of floats
+       * to a vec4, this isn't going to work anymore, so use a temporary
+       * array instead.
+       */
+      ir_variable *temp = new(ctx) ir_variable(
+         actual_param->type, "temp_tess_level", ir_var_temporary);
+      this->base_ir->insert_before(temp);
+      actual_param->replace_with(
+         new(ctx) ir_dereference_variable(temp));
+      if (formal_param->data.mode == ir_var_function_in
+          || formal_param->data.mode == ir_var_function_inout) {
+         /* Copy from gl_TessLevel* to the temporary before the call.
+          * Since we are going to insert this copy before the current
+          * instruction, we need to visit it afterwards to make sure it
+          * gets lowered.
+          */
+         ir_assignment *new_assignment = new(ctx) ir_assignment(
+            new(ctx) ir_dereference_variable(temp),
+            actual_param->clone(ctx, NULL));
+         this->base_ir->insert_before(new_assignment);
+         this->visit_new_assignment(new_assignment);
+      }
+      if (formal_param->data.mode == ir_var_function_out
+          || formal_param->data.mode == ir_var_function_inout) {
+         /* Copy from the temporary to gl_TessLevel* after the call.
+          * Since visit_list_elements() has already decided which
+          * instruction it's going to visit next, we need to visit
+          * afterwards to make sure it gets lowered.
+          */
+         ir_assignment *new_assignment = new(ctx) ir_assignment(
+            actual_param->clone(ctx, NULL),
+            new(ctx) ir_dereference_variable(temp));
+         this->base_ir->insert_after(new_assignment);
+         this->visit_new_assignment(new_assignment);
+      }
+   }
+
+   return rvalue_visit(ir);
+}
+
+
+bool
+lower_tess_level(gl_shader *shader)
+{
+   if ((shader->Stage != MESA_SHADER_TESS_CTRL) &&
+       (shader->Stage != MESA_SHADER_TESS_EVAL))
+      return false;
+
+   lower_tess_level_visitor v(shader->Stage);
+
+   visit_list_elements(&v, shader->ir);
+
+   if (v.new_tess_level_outer_var)
+      shader->symbols->add_variable(v.new_tess_level_outer_var);
+   if (v.new_tess_level_inner_var)
+      shader->symbols->add_variable(v.new_tess_level_inner_var);
+
+   return v.progress;
+}
diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index 4ea4ccb03f2..8b0810781fe 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -37,6 +37,7 @@
 #include "ir_builder.h"
 #include "ir_rvalue_visitor.h"
 #include "main/macros.h"
+#include "glsl_parser_extras.h"
 
 using namespace ir_builder;
 
@@ -139,17 +140,38 @@ public:
    }
 
    void handle_rvalue(ir_rvalue **rvalue);
-   void emit_ubo_loads(ir_dereference *deref, ir_variable *base_offset,
-                       unsigned int deref_offset, bool row_major,
-                       int matrix_columns);
+   ir_visitor_status visit_enter(ir_assignment *ir);
+
+   void setup_for_load_or_store(ir_variable *var,
+                                ir_dereference *deref,
+                                ir_rvalue **offset,
+                                unsigned *const_offset,
+                                bool *row_major,
+                                int *matrix_columns);
    ir_expression *ubo_load(const struct glsl_type *type,
 			   ir_rvalue *offset);
+   ir_call *ssbo_load(const struct glsl_type *type,
+                      ir_rvalue *offset);
+
+   void check_for_ssbo_store(ir_assignment *ir);
+   void write_to_memory(ir_dereference *deref,
+                        ir_variable *var,
+                        ir_variable *write_var,
+                        unsigned write_mask);
+   ir_call *ssbo_store(ir_rvalue *deref, ir_rvalue *offset,
+                       unsigned write_mask);
+
+   void emit_access(bool is_write, ir_dereference *deref,
+                    ir_variable *base_offset, unsigned int deref_offset,
+                    bool row_major, int matrix_columns,
+                    unsigned write_mask);
 
    void *mem_ctx;
    struct gl_shader *shader;
    struct gl_uniform_buffer_variable *ubo_var;
    ir_rvalue *uniform_block;
    bool progress;
+   bool is_shader_storage;
 };
 
 /**
@@ -218,26 +240,20 @@ interface_field_name(void *mem_ctx, char *base_name, ir_dereference *d,
 }
 
 void
-lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
+lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var,
+                                                     ir_dereference *deref,
+                                                     ir_rvalue **offset,
+                                                     unsigned *const_offset,
+                                                     bool *row_major,
+                                                     int *matrix_columns)
 {
-   if (!*rvalue)
-      return;
-
-   ir_dereference *deref = (*rvalue)->as_dereference();
-   if (!deref)
-      return;
-
-   ir_variable *var = deref->variable_referenced();
-   if (!var || !var->is_in_uniform_block())
-      return;
-
-   mem_ctx = ralloc_parent(*rvalue);
-
+   /* Determine the name of the interface block */
    ir_rvalue *nonconst_block_index;
    const char *const field_name =
       interface_field_name(mem_ctx, (char *) var->get_interface_type()->name,
                            deref, &nonconst_block_index);
 
+   /* Locate the ubo block by interface name */
    this->uniform_block = NULL;
    for (unsigned i = 0; i < shader->NumUniformBlocks; i++) {
       if (strcmp(field_name, shader->UniformBlocks[i].Name) == 0) {
@@ -252,6 +268,8 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
             this->uniform_block = index;
          }
 
+         this->is_shader_storage = shader->UniformBlocks[i].IsShaderStorage;
+
          struct gl_uniform_block *block = &shader->UniformBlocks[i];
 
          this->ubo_var = var->is_interface_instance()
@@ -263,10 +281,10 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
 
    assert(this->uniform_block);
 
-   ir_rvalue *offset = new(mem_ctx) ir_constant(0u);
-   unsigned const_offset = 0;
-   bool row_major = is_dereferenced_thing_row_major(deref);
-   int matrix_columns = 1;
+   *offset = new(mem_ctx) ir_constant(0u);
+   *const_offset = 0;
+   *row_major = is_dereferenced_thing_row_major(deref);
+   *matrix_columns = 1;
 
    /* Calculate the offset to the start of the region of the UBO
     * dereferenced by *rvalue.  This may be a variable offset if an
@@ -275,76 +293,76 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
    while (deref) {
       switch (deref->ir_type) {
       case ir_type_dereference_variable: {
-	 const_offset += ubo_var->Offset;
-	 deref = NULL;
-	 break;
+         *const_offset += ubo_var->Offset;
+         deref = NULL;
+         break;
       }
 
       case ir_type_dereference_array: {
-	 ir_dereference_array *deref_array = (ir_dereference_array *)deref;
-	 unsigned array_stride;
-	 if (deref_array->array->type->is_matrix() && row_major) {
-	    /* When loading a vector out of a row major matrix, the
-	     * step between the columns (vectors) is the size of a
-	     * float, while the step between the rows (elements of a
-	     * vector) is handled below in emit_ubo_loads.
-	     */
-	    array_stride = 4;
+         ir_dereference_array *deref_array = (ir_dereference_array *) deref;
+         unsigned array_stride;
+         if (deref_array->array->type->is_matrix() && *row_major) {
+            /* When loading a vector out of a row major matrix, the
+             * step between the columns (vectors) is the size of a
+             * float, while the step between the rows (elements of a
+             * vector) is handled below in emit_access.
+             */
+            array_stride = 4;
             if (deref_array->array->type->is_double())
                array_stride *= 2;
-            matrix_columns = deref_array->array->type->matrix_columns;
+            *matrix_columns = deref_array->array->type->matrix_columns;
          } else if (deref_array->type->is_interface()) {
             /* We're processing an array dereference of an interface instance
-	     * array.  The thing being dereferenced *must* be a variable
-	     * dereference because intefaces cannot be embedded an other
-	     * types.  In terms of calculating the offsets for the lowering
-	     * pass, we don't care about the array index.  All elements of an
-	     * interface instance array will have the same offsets relative to
-	     * the base of the block that backs them.
+             * array. The thing being dereferenced *must* be a variable
+             * dereference because interfaces cannot be embedded in other
+             * types. In terms of calculating the offsets for the lowering
+             * pass, we don't care about the array index. All elements of an
+             * interface instance array will have the same offsets relative to
+             * the base of the block that backs them.
              */
             assert(deref_array->array->as_dereference_variable());
             deref = deref_array->array->as_dereference();
             break;
-	 } else {
+         } else {
             /* Whether or not the field is row-major (because it might be a
-             * bvec2 or something) does not affect the array itself.  We need
+             * bvec2 or something) does not affect the array itself. We need
              * to know whether an array element in its entirety is row-major.
              */
             const bool array_row_major =
                is_dereferenced_thing_row_major(deref_array);
 
-	    array_stride = deref_array->type->std140_size(array_row_major);
-	    array_stride = glsl_align(array_stride, 16);
-	 }
+            array_stride = deref_array->type->std140_size(array_row_major);
+            array_stride = glsl_align(array_stride, 16);
+         }
 
          ir_rvalue *array_index = deref_array->array_index;
          if (array_index->type->base_type == GLSL_TYPE_INT)
             array_index = i2u(array_index);
 
-	 ir_constant *const_index =
+         ir_constant *const_index =
             array_index->constant_expression_value(NULL);
-	 if (const_index) {
-	    const_offset += array_stride * const_index->value.u[0];
-	 } else {
-	    offset = add(offset,
-			 mul(array_index,
-			     new(mem_ctx) ir_constant(array_stride)));
-	 }
-	 deref = deref_array->array->as_dereference();
-	 break;
+         if (const_index) {
+            *const_offset += array_stride * const_index->value.u[0];
+         } else {
+            *offset = add(*offset,
+                          mul(array_index,
+                              new(mem_ctx) ir_constant(array_stride)));
+         }
+         deref = deref_array->array->as_dereference();
+         break;
       }
 
       case ir_type_dereference_record: {
-	 ir_dereference_record *deref_record = (ir_dereference_record *)deref;
-	 const glsl_type *struct_type = deref_record->record->type;
-	 unsigned intra_struct_offset = 0;
+         ir_dereference_record *deref_record = (ir_dereference_record *) deref;
+         const glsl_type *struct_type = deref_record->record->type;
+         unsigned intra_struct_offset = 0;
 
-	 for (unsigned int i = 0; i < struct_type->length; i++) {
-	    const glsl_type *type = struct_type->fields.structure[i].type;
+         for (unsigned int i = 0; i < struct_type->length; i++) {
+            const glsl_type *type = struct_type->fields.structure[i].type;
 
-            ir_dereference_record *field_deref =
-               new(mem_ctx) ir_dereference_record(deref_record->record,
-                                                  struct_type->fields.structure[i].name);
+            ir_dereference_record *field_deref = new(mem_ctx)
+               ir_dereference_record(deref_record->record,
+                                     struct_type->fields.structure[i].name);
             const bool field_row_major =
                is_dereferenced_thing_row_major(field_deref);
 
@@ -352,11 +370,12 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
 
             unsigned field_align = type->std140_base_alignment(field_row_major);
 
-	    intra_struct_offset = glsl_align(intra_struct_offset, field_align);
+            intra_struct_offset = glsl_align(intra_struct_offset, field_align);
+
+            if (strcmp(struct_type->fields.structure[i].name,
+                       deref_record->field) == 0)
+               break;
 
-	    if (strcmp(struct_type->fields.structure[i].name,
-		       deref_record->field) == 0)
-	       break;
             intra_struct_offset += type->std140_size(field_row_major);
 
             /* If the field just examined was itself a structure, apply rule
@@ -371,19 +390,49 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
                                                 field_align);
 
             }
-	 }
+         }
 
-	 const_offset += intra_struct_offset;
-
-	 deref = deref_record->record->as_dereference();
-	 break;
+         *const_offset += intra_struct_offset;
+         deref = deref_record->record->as_dereference();
+         break;
       }
+
       default:
-	 assert(!"not reached");
-	 deref = NULL;
-	 break;
+         assert(!"not reached");
+         deref = NULL;
+         break;
       }
    }
+}
+
+void
+lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
+{
+   if (!*rvalue)
+      return;
+
+   ir_dereference *deref = (*rvalue)->as_dereference();
+   if (!deref)
+      return;
+
+   ir_variable *var = deref->variable_referenced();
+   if (!var || !var->is_in_buffer_block())
+      return;
+
+   mem_ctx = ralloc_parent(shader->ir);
+
+   ir_rvalue *offset = NULL;
+   unsigned const_offset;
+   bool row_major;
+   int matrix_columns;
+
+   /* Compute the offset to the start of the dereference as well as other
+    * information we need to configure the load
+    */
+   setup_for_load_or_store(var, deref,
+                           &offset, &const_offset,
+                           &row_major, &matrix_columns);
+   assert(offset);
 
    /* Now that we've calculated the offset to the start of the
     * dereference, walk over the type and emit loads into a temporary.
@@ -401,7 +450,8 @@ lower_ubo_reference_visitor::handle_rvalue(ir_rvalue **rvalue)
    base_ir->insert_before(assign(load_offset, offset));
 
    deref = new(mem_ctx) ir_dereference_variable(load_var);
-   emit_ubo_loads(deref, load_offset, const_offset, row_major, matrix_columns);
+   emit_access(false, deref, load_offset, const_offset,
+               row_major, matrix_columns, 0);
    *rvalue = deref;
 
    progress = true;
@@ -420,74 +470,163 @@ lower_ubo_reference_visitor::ubo_load(const glsl_type *type,
 
 }
 
+static bool
+shader_storage_buffer_object(const _mesa_glsl_parse_state *state)
+{
+   return state->ARB_shader_storage_buffer_object_enable;
+}
+
+ir_call *
+lower_ubo_reference_visitor::ssbo_store(ir_rvalue *deref,
+                                        ir_rvalue *offset,
+                                        unsigned write_mask)
+{
+   exec_list sig_params;
+
+   ir_variable *block_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "block_ref" , ir_var_function_in);
+   sig_params.push_tail(block_ref);
+
+   ir_variable *offset_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "offset" , ir_var_function_in);
+   sig_params.push_tail(offset_ref);
+
+   ir_variable *val_ref = new(mem_ctx)
+      ir_variable(deref->type, "value" , ir_var_function_in);
+   sig_params.push_tail(val_ref);
+
+   ir_variable *writemask_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "write_mask" , ir_var_function_in);
+   sig_params.push_tail(writemask_ref);
+
+   ir_function_signature *sig = new(mem_ctx)
+      ir_function_signature(glsl_type::void_type, shader_storage_buffer_object);
+   assert(sig);
+   sig->replace_parameters(&sig_params);
+   sig->is_intrinsic = true;
+
+   ir_function *f = new(mem_ctx) ir_function("__intrinsic_store_ssbo");
+   f->add_signature(sig);
+
+   exec_list call_params;
+   call_params.push_tail(this->uniform_block->clone(mem_ctx, NULL));
+   call_params.push_tail(offset->clone(mem_ctx, NULL));
+   call_params.push_tail(deref->clone(mem_ctx, NULL));
+   call_params.push_tail(new(mem_ctx) ir_constant(write_mask));
+   return new(mem_ctx) ir_call(sig, NULL, &call_params);
+}
+
+ir_call *
+lower_ubo_reference_visitor::ssbo_load(const struct glsl_type *type,
+                                       ir_rvalue *offset)
+{
+   exec_list sig_params;
+
+   ir_variable *block_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "block_ref" , ir_var_function_in);
+   sig_params.push_tail(block_ref);
+
+   ir_variable *offset_ref = new(mem_ctx)
+      ir_variable(glsl_type::uint_type, "offset_ref" , ir_var_function_in);
+   sig_params.push_tail(offset_ref);
+
+   ir_function_signature *sig =
+      new(mem_ctx) ir_function_signature(type, shader_storage_buffer_object);
+   assert(sig);
+   sig->replace_parameters(&sig_params);
+   sig->is_intrinsic = true;
+
+   ir_function *f = new(mem_ctx) ir_function("__intrinsic_load_ssbo");
+   f->add_signature(sig);
+
+   ir_variable *result = new(mem_ctx)
+      ir_variable(type, "ssbo_load_result", ir_var_temporary);
+   base_ir->insert_before(result);
+   ir_dereference_variable *deref_result = new(mem_ctx)
+      ir_dereference_variable(result);
+
+   exec_list call_params;
+   call_params.push_tail(this->uniform_block->clone(mem_ctx, NULL));
+   call_params.push_tail(offset->clone(mem_ctx, NULL));
+
+   return new(mem_ctx) ir_call(sig, deref_result, &call_params);
+}
+
+static inline int
+writemask_for_size(unsigned n)
+{
+   return ((1 << n) - 1);
+}
+
 /**
- * Takes LHS and emits a series of assignments into its components
- * from the UBO variable at variable_offset + deref_offset.
- *
- * Recursively calls itself to break the deref down to the point that
- * the ir_binop_ubo_load expressions generated are contiguous scalars
- * or vectors.
+ * Takes a deref and recursively calls itself to break the deref down to the
+ * point that the reads or writes generated are contiguous scalars or vectors.
  */
 void
-lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
-					    ir_variable *base_offset,
-                                            unsigned int deref_offset,
-                                            bool row_major,
-                                            int matrix_columns)
+lower_ubo_reference_visitor::emit_access(bool is_write,
+                                         ir_dereference *deref,
+                                         ir_variable *base_offset,
+                                         unsigned int deref_offset,
+                                         bool row_major,
+                                         int matrix_columns,
+                                         unsigned write_mask)
 {
    if (deref->type->is_record()) {
       unsigned int field_offset = 0;
 
       for (unsigned i = 0; i < deref->type->length; i++) {
-	 const struct glsl_struct_field *field =
-	    &deref->type->fields.structure[i];
-	 ir_dereference *field_deref =
-	    new(mem_ctx) ir_dereference_record(deref->clone(mem_ctx, NULL),
-					       field->name);
+         const struct glsl_struct_field *field =
+            &deref->type->fields.structure[i];
+         ir_dereference *field_deref =
+            new(mem_ctx) ir_dereference_record(deref->clone(mem_ctx, NULL),
+                                               field->name);
 
-	 field_offset =
-	    glsl_align(field_offset,
+         field_offset =
+            glsl_align(field_offset,
                        field->type->std140_base_alignment(row_major));
 
-	 emit_ubo_loads(field_deref, base_offset, deref_offset + field_offset,
-                        row_major, 1);
+         emit_access(is_write, field_deref, base_offset,
+                     deref_offset + field_offset,
+                     row_major, 1,
+                     writemask_for_size(field_deref->type->vector_elements));
 
-	 field_offset += field->type->std140_size(row_major);
+         field_offset += field->type->std140_size(row_major);
       }
       return;
    }
 
    if (deref->type->is_array()) {
       unsigned array_stride =
-	 glsl_align(deref->type->fields.array->std140_size(row_major),
-		    16);
+         glsl_align(deref->type->fields.array->std140_size(row_major), 16);
 
       for (unsigned i = 0; i < deref->type->length; i++) {
-	 ir_constant *element = new(mem_ctx) ir_constant(i);
-	 ir_dereference *element_deref =
-	    new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL),
-					      element);
-	 emit_ubo_loads(element_deref, base_offset,
-			deref_offset + i * array_stride,
-                        row_major, 1);
+         ir_constant *element = new(mem_ctx) ir_constant(i);
+         ir_dereference *element_deref =
+            new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL),
+                                              element);
+         emit_access(is_write, element_deref, base_offset,
+                     deref_offset + i * array_stride,
+                     row_major, 1,
+                     writemask_for_size(element_deref->type->vector_elements));
       }
       return;
    }
 
    if (deref->type->is_matrix()) {
       for (unsigned i = 0; i < deref->type->matrix_columns; i++) {
-	 ir_constant *col = new(mem_ctx) ir_constant(i);
-	 ir_dereference *col_deref =
-	    new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL),
-					      col);
+         ir_constant *col = new(mem_ctx) ir_constant(i);
+         ir_dereference *col_deref =
+            new(mem_ctx) ir_dereference_array(deref->clone(mem_ctx, NULL), col);
 
          if (row_major) {
             /* For a row-major matrix, the next column starts at the next
              * element.
              */
             int size_mul = deref->type->is_double() ? 8 : 4;
-            emit_ubo_loads(col_deref, base_offset, deref_offset + i * size_mul,
-                           row_major, deref->type->matrix_columns);
+            emit_access(is_write, col_deref, base_offset,
+                        deref_offset + i * size_mul,
+                        row_major, deref->type->matrix_columns,
+                        writemask_for_size(col_deref->type->vector_elements));
          } else {
             /* std140 always rounds the stride of arrays (and matrices) to a
              * vec4, so matrices are always 16 between columns/rows. With
@@ -495,21 +634,33 @@ lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
              */
             int size_mul = (deref->type->is_double() &&
                             deref->type->vector_elements > 2) ? 32 : 16;
-            emit_ubo_loads(col_deref, base_offset, deref_offset + i * size_mul,
-                           row_major, deref->type->matrix_columns);
+            emit_access(is_write, col_deref, base_offset,
+                        deref_offset + i * size_mul,
+                        row_major, deref->type->matrix_columns,
+                        writemask_for_size(col_deref->type->vector_elements));
          }
       }
       return;
    }
 
-   assert(deref->type->is_scalar() ||
-	  deref->type->is_vector());
+   assert(deref->type->is_scalar() || deref->type->is_vector());
 
    if (!row_major) {
-      ir_rvalue *offset = add(base_offset,
-			      new(mem_ctx) ir_constant(deref_offset));
-      base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
-				    ubo_load(deref->type, offset)));
+      ir_rvalue *offset =
+         add(base_offset, new(mem_ctx) ir_constant(deref_offset));
+      if (is_write)
+         base_ir->insert_after(ssbo_store(deref, offset, write_mask));
+      else {
+         if (!this->is_shader_storage) {
+             base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                           ubo_load(deref->type, offset)));
+         } else {
+            ir_call *load_ssbo = ssbo_load(deref->type, offset);
+            base_ir->insert_before(load_ssbo);
+            ir_rvalue *value = load_ssbo->return_deref->as_rvalue()->clone(mem_ctx, NULL);
+            base_ir->insert_before(assign(deref->clone(mem_ctx, NULL), value));
+         }
+      }
    } else {
       unsigned N = deref->type->is_double() ? 8 : 4;
 
@@ -527,22 +678,109 @@ lower_ubo_reference_visitor::emit_ubo_loads(ir_dereference *deref,
       assert(matrix_columns <= 4);
       unsigned matrix_stride = glsl_align(matrix_columns * N, 16);
 
-      const glsl_type *ubo_type = deref->type->base_type == GLSL_TYPE_FLOAT ?
+      const glsl_type *deref_type = deref->type->base_type == GLSL_TYPE_FLOAT ?
          glsl_type::float_type : glsl_type::double_type;
 
       for (unsigned i = 0; i < deref->type->vector_elements; i++) {
-	 ir_rvalue *chan_offset =
-	    add(base_offset,
-		new(mem_ctx) ir_constant(deref_offset + i * matrix_stride));
-
-	 base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
-				       ubo_load(ubo_type,
-						chan_offset),
-				       (1U << i)));
+         ir_rvalue *chan_offset =
+            add(base_offset,
+                new(mem_ctx) ir_constant(deref_offset + i * matrix_stride));
+         if (is_write) {
+            base_ir->insert_after(ssbo_store(swizzle(deref, i, 1), chan_offset, 1));
+         } else {
+            if (!this->is_shader_storage) {
+               base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                             ubo_load(deref_type, chan_offset),
+                                             (1U << i)));
+            } else {
+               ir_call *load_ssbo = ssbo_load(deref_type, chan_offset);
+               base_ir->insert_before(load_ssbo);
+               ir_rvalue *value = load_ssbo->return_deref->as_rvalue()->clone(mem_ctx, NULL);
+               base_ir->insert_before(assign(deref->clone(mem_ctx, NULL),
+                                             value,
+                                             (1U << i)));
+            }
+         }
       }
    }
 }
 
+void
+lower_ubo_reference_visitor::write_to_memory(ir_dereference *deref,
+                                             ir_variable *var,
+                                             ir_variable *write_var,
+                                             unsigned write_mask)
+{
+   ir_rvalue *offset = NULL;
+   unsigned const_offset;
+   bool row_major;
+   int matrix_columns;
+
+   /* Compute the offset to the start of the dereference as well as other
+    * information we need to configure the write
+    */
+   setup_for_load_or_store(var, deref,
+                           &offset, &const_offset,
+                           &row_major, &matrix_columns);
+   assert(offset);
+
+   /* Now emit writes from the temporary to memory */
+   ir_variable *write_offset =
+      new(mem_ctx) ir_variable(glsl_type::uint_type,
+                               "ssbo_store_temp_offset",
+                               ir_var_temporary);
+
+   base_ir->insert_before(write_offset);
+   base_ir->insert_before(assign(write_offset, offset));
+
+   deref = new(mem_ctx) ir_dereference_variable(write_var);
+   emit_access(true, deref, write_offset, const_offset,
+               row_major, matrix_columns, write_mask);
+}
+
+void
+lower_ubo_reference_visitor::check_for_ssbo_store(ir_assignment *ir)
+{
+   if (!ir || !ir->lhs)
+      return;
+
+   ir_rvalue *rvalue = ir->lhs->as_rvalue();
+   if (!rvalue)
+      return;
+
+   ir_dereference *deref = ir->lhs->as_dereference();
+   if (!deref)
+      return;
+
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (!var || !var->is_in_buffer_block())
+      return;
+
+   /* We have a write to a buffer variable, so declare a temporary and rewrite
+    * the assignment so that the temporary is the LHS.
+    */
+   mem_ctx = ralloc_parent(shader->ir);
+
+   const glsl_type *type = rvalue->type;
+   ir_variable *write_var = new(mem_ctx) ir_variable(type,
+                                                     "ssbo_store_temp",
+                                                     ir_var_temporary);
+   base_ir->insert_before(write_var);
+   ir->lhs = new(mem_ctx) ir_dereference_variable(write_var);
+
+   /* Now we have to write the value assigned to the temporary back to memory */
+   write_to_memory(deref, var, write_var, ir->write_mask);
+   progress = true;
+}
+
+
+ir_visitor_status
+lower_ubo_reference_visitor::visit_enter(ir_assignment *ir)
+{
+   check_for_ssbo_store(ir);
+   return rvalue_visit(ir);
+}
+
 } /* unnamed namespace */
 
 void
diff --git a/src/glsl/lower_variable_index_to_cond_assign.cpp b/src/glsl/lower_variable_index_to_cond_assign.cpp
index d878cb07811..1ab3afecc7e 100644
--- a/src/glsl/lower_variable_index_to_cond_assign.cpp
+++ b/src/glsl/lower_variable_index_to_cond_assign.cpp
@@ -335,12 +335,14 @@ struct switch_generator
 
 class variable_index_to_cond_assign_visitor : public ir_rvalue_visitor {
 public:
-   variable_index_to_cond_assign_visitor(bool lower_input,
-					 bool lower_output,
-					 bool lower_temp,
-					 bool lower_uniform)
+   variable_index_to_cond_assign_visitor(gl_shader_stage stage,
+                                         bool lower_input,
+                                         bool lower_output,
+                                         bool lower_temp,
+                                         bool lower_uniform)
    {
       this->progress = false;
+      this->stage = stage;
       this->lower_inputs = lower_input;
       this->lower_outputs = lower_output;
       this->lower_temps = lower_temp;
@@ -348,6 +350,8 @@ public:
    }
 
    bool progress;
+
+   gl_shader_stage stage;
    bool lower_inputs;
    bool lower_outputs;
    bool lower_temps;
@@ -369,17 +373,44 @@ public:
       case ir_var_auto:
       case ir_var_temporary:
 	 return this->lower_temps;
+
       case ir_var_uniform:
+      case ir_var_shader_storage:
 	 return this->lower_uniforms;
+
       case ir_var_function_in:
       case ir_var_const_in:
          return this->lower_temps;
+
       case ir_var_shader_in:
+         /* The input array size is unknown at compile time for non-patch
+          * inputs in TCS and TES. The arrays are sized to
+          * the implementation-dependent limit "gl_MaxPatchVertices", but
+          * the real size is stored in the "gl_PatchVerticesIn" built-in
+          * uniform.
+          *
+          * The TCS input array size is specified by
+          * glPatchParameteri(GL_PATCH_VERTICES).
+          *
+          * The TES input array size is specified by the "vertices" output
+          * layout qualifier in TCS.
+          */
+         if ((stage == MESA_SHADER_TESS_CTRL ||
+              stage == MESA_SHADER_TESS_EVAL) && !var->data.patch)
+            return false;
          return this->lower_inputs;
+
       case ir_var_function_out:
+         /* TCS non-patch outputs can only be indexed with "gl_InvocationID".
+          * Other expressions are not allowed.
+          */
+         if (stage == MESA_SHADER_TESS_CTRL && !var->data.patch)
+            return false;
          return this->lower_temps;
+
       case ir_var_shader_out:
          return this->lower_outputs;
+
       case ir_var_function_inout:
 	 return this->lower_temps;
       }
@@ -522,16 +553,18 @@ public:
 } /* anonymous namespace */
 
 bool
-lower_variable_index_to_cond_assign(exec_list *instructions,
-				    bool lower_input,
-				    bool lower_output,
-				    bool lower_temp,
-				    bool lower_uniform)
+lower_variable_index_to_cond_assign(gl_shader_stage stage,
+                                    exec_list *instructions,
+                                    bool lower_input,
+                                    bool lower_output,
+                                    bool lower_temp,
+                                    bool lower_uniform)
 {
-   variable_index_to_cond_assign_visitor v(lower_input,
-					   lower_output,
-					   lower_temp,
-					   lower_uniform);
+   variable_index_to_cond_assign_visitor v(stage,
+                                           lower_input,
+                                           lower_output,
+                                           lower_temp,
+                                           lower_uniform);
 
    /* Continue lowering until no progress is made.  If there are multiple
     * levels of indirection (e.g., non-constant indexing of array elements and
diff --git a/src/glsl/main.cpp b/src/glsl/main.cpp
index 23412980dce..df93a013ede 100644
--- a/src/glsl/main.cpp
+++ b/src/glsl/main.cpp
@@ -204,6 +204,8 @@ initialize_context(struct gl_context *ctx, gl_api api)
    }
 
    ctx->Const.GenerateTemporaryNames = true;
+   ctx->Const.MaxPatchVertices = 32;
+
    ctx->Driver.NewShader = _mesa_new_shader;
 }
 
@@ -273,7 +275,7 @@ usage_fail(const char *name)
 {
 
    const char *header =
-      "usage: %s [options] <file.vert | file.geom | file.frag>\n"
+      "usage: %s [options] <file.vert | file.tesc | file.tese | file.geom | file.frag | file.comp>\n"
       "\n"
       "Possible options are:\n";
    printf(header, name);
@@ -373,6 +375,10 @@ main(int argc, char **argv)
       const char *const ext = & argv[optind][len - 5];
       if (strncmp(".vert", ext, 5) == 0 || strncmp(".glsl", ext, 5) == 0)
 	 shader->Type = GL_VERTEX_SHADER;
+      else if (strncmp(".tesc", ext, 5) == 0)
+	 shader->Type = GL_TESS_CONTROL_SHADER;
+      else if (strncmp(".tese", ext, 5) == 0)
+	 shader->Type = GL_TESS_EVALUATION_SHADER;
       else if (strncmp(".geom", ext, 5) == 0)
 	 shader->Type = GL_GEOMETRY_SHADER;
       else if (strncmp(".frag", ext, 5) == 0)
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 54e56145c89..27dabd3b8f2 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -284,6 +284,9 @@ nir_visitor::visit(ir_variable *ir)
       var->data.mode = nir_var_uniform;
       break;
 
+   case ir_var_shader_storage:
+      var->data.mode = nir_var_shader_storage;
+      break;
 
    case ir_var_system_value:
       var->data.mode = nir_var_system_value;
@@ -376,6 +379,7 @@ nir_visitor::visit(ir_variable *ir)
       break;
 
    case nir_var_uniform:
+   case nir_var_shader_storage:
       exec_list_push_tail(&shader->uniforms, &var->node);
       break;
 
@@ -1182,6 +1186,7 @@ nir_visitor::visit(ir_expression *ir)
    case ir_unop_bitcast_f2i:
    case ir_unop_bitcast_u2f:
    case ir_unop_bitcast_f2u:
+   case ir_unop_subroutine_to_int:
       /* no-op */
       emit(nir_op_imov, dest_size, srcs);
       break;
diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index f03e80a4e0e..2f7cbae42be 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -57,7 +57,6 @@ reg_create(void *mem_ctx, struct exec_list *list)
 {
    nir_register *reg = ralloc(mem_ctx, nir_register);
 
-   reg->parent_instr = NULL;
    list_inithead(&reg->uses);
    list_inithead(&reg->defs);
    list_inithead(&reg->if_uses);
@@ -148,18 +147,18 @@ void nir_src_copy(nir_src *dest, const nir_src *src, void *mem_ctx)
 
 void nir_dest_copy(nir_dest *dest, const nir_dest *src, void *mem_ctx)
 {
-   dest->is_ssa = src->is_ssa;
-   if (src->is_ssa) {
-      dest->ssa = src->ssa;
+   /* Copying an SSA definition makes no sense whatsoever. */
+   assert(!src->is_ssa);
+
+   dest->is_ssa = false;
+
+   dest->reg.base_offset = src->reg.base_offset;
+   dest->reg.reg = src->reg.reg;
+   if (src->reg.indirect) {
+      dest->reg.indirect = ralloc(mem_ctx, nir_src);
+      nir_src_copy(dest->reg.indirect, src->reg.indirect, mem_ctx);
    } else {
-      dest->reg.base_offset = src->reg.base_offset;
-      dest->reg.reg = src->reg.reg;
-      if (src->reg.indirect) {
-         dest->reg.indirect = ralloc(mem_ctx, nir_src);
-         nir_src_copy(dest->reg.indirect, src->reg.indirect, mem_ctx);
-      } else {
-         dest->reg.indirect = NULL;
-      }
+      dest->reg.indirect = NULL;
    }
 }
 
@@ -451,7 +450,7 @@ nir_call_instr_create(nir_shader *shader, nir_function_overload *callee)
 nir_tex_instr *
 nir_tex_instr_create(nir_shader *shader, unsigned num_srcs)
 {
-   nir_tex_instr *instr = ralloc(shader, nir_tex_instr);
+   nir_tex_instr *instr = rzalloc(shader, nir_tex_instr);
    instr_init(&instr->instr, nir_instr_type_tex);
 
    dest_init(&instr->dest);
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 7a088c44e8b..70af06e6971 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -87,6 +87,7 @@ typedef enum {
    nir_var_global,
    nir_var_local,
    nir_var_uniform,
+   nir_var_shader_storage,
    nir_var_system_value
 } nir_variable_mode;
 
@@ -390,14 +391,6 @@ typedef struct {
     */
    bool is_packed;
 
-   /**
-    * If this pointer is non-NULL then this register has exactly one
-    * definition and that definition dominates all of its uses.  This is
-    * set by the out-of-SSA pass so that backends can get SSA-like
-    * information even once they have gone out of SSA.
-    */
-   struct nir_instr *parent_instr;
-
    /** set of nir_instr's where this register is used (read from) */
    struct list_head uses;
 
@@ -451,6 +444,18 @@ nir_instr_prev(nir_instr *instr)
       return exec_node_data(nir_instr, prev, node);
 }
 
+/* True when the node preceding instr is the list's head sentinel. */
+static inline bool nir_instr_is_first(nir_instr *instr)
+{
+   return exec_node_is_head_sentinel(exec_node_get_prev(&instr->node));
+}
+
+/* True when the node following instr is the list's tail sentinel. */
+static inline bool nir_instr_is_last(nir_instr *instr)
+{
+   return exec_node_is_tail_sentinel(exec_node_get_next(&instr->node));
+}
+
 typedef struct {
    /** for debugging only, can be NULL */
    const char* name;
@@ -574,16 +579,6 @@ nir_src_for_reg(nir_register *reg)
    return src;
 }
 
-static inline nir_instr *
-nir_src_get_parent_instr(const nir_src *src)
-{
-   if (src->is_ssa) {
-      return src->ssa->parent_instr;
-   } else {
-      return src->reg.reg->parent_instr;
-   }
-}
-
 static inline nir_dest
 nir_dest_for_reg(nir_register *reg)
 {
@@ -1259,6 +1254,8 @@ nir_block_last_instr(nir_block *block)
    foreach_list_typed_reverse(nir_instr, instr, node, &(block)->instr_list)
 #define nir_foreach_instr_safe(block, instr) \
    foreach_list_typed_safe(nir_instr, instr, node, &(block)->instr_list)
+#define nir_foreach_instr_safe_reverse(block, instr) \
+   foreach_list_typed_safe_reverse(nir_instr, instr, node, &(block)->instr_list)
 
 typedef struct nir_if {
    nir_cf_node cf_node;
@@ -1661,14 +1658,16 @@ void nir_lower_global_vars_to_local(nir_shader *shader);
 
 void nir_lower_locals_to_regs(nir_shader *shader);
 
-void nir_assign_var_locations_scalar(struct exec_list *var_list,
-                                     unsigned *size);
-void nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
-                                                  struct exec_list *var_list,
-                                                  unsigned *direct_size,
-                                                  unsigned *size);
+void nir_assign_var_locations(struct exec_list *var_list,
+                              unsigned *size,
+                              bool is_scalar);
+void nir_assign_var_locations_direct_first(nir_shader *shader,
+                                           struct exec_list *var_list,
+                                           unsigned *direct_size,
+                                           unsigned *size,
+                                           bool is_scalar);
 
-void nir_lower_io(nir_shader *shader);
+void nir_lower_io(nir_shader *shader, bool is_scalar);
 
 void nir_lower_vars_to_ssa(nir_shader *shader);
 
@@ -1676,6 +1675,7 @@ void nir_remove_dead_variables(nir_shader *shader);
 
 void nir_lower_vec_to_movs(nir_shader *shader);
 void nir_lower_alu_to_scalar(nir_shader *shader);
+void nir_lower_load_const_to_scalar(nir_shader *shader);
 
 void nir_lower_phis_to_scalar(nir_shader *shader);
 
@@ -1698,7 +1698,12 @@ bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b);
 
 void nir_convert_to_ssa_impl(nir_function_impl *impl);
 void nir_convert_to_ssa(nir_shader *shader);
-void nir_convert_from_ssa(nir_shader *shader);
+
+/* If phi_webs_only is true, only convert SSA values involved in phi nodes to
+ * registers.  If false, convert all values (even those not involved in a phi
+ * node) to registers.
+ */
+void nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only);
 
 bool nir_opt_algebraic(nir_shader *shader);
 bool nir_opt_algebraic_late(nir_shader *shader);
@@ -1721,6 +1726,8 @@ bool nir_opt_peephole_ffma(nir_shader *shader);
 
 bool nir_opt_remove_phis(nir_shader *shader);
 
+bool nir_opt_undef(nir_shader *shader);
+
 void nir_sweep(nir_shader *shader);
 
 #ifdef __cplusplus
diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py
index bf82fe533d6..8fd9b1039a7 100644
--- a/src/glsl/nir/nir_constant_expressions.py
+++ b/src/glsl/nir/nir_constant_expressions.py
@@ -31,12 +31,7 @@ template = """\
 #include "util/rounding.h" /* for _mesa_roundeven */
 #include "nir_constant_expressions.h"
 
-#if defined(_MSC_VER) && (_MSC_VER < 1800)
-static int isnormal(double x)
-{
-   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
-}
-#elif defined(__SUNPRO_CC)
+#if defined(__SUNPRO_CC)
 #include <ieeefp.h>
 static int isnormal(double x)
 {
@@ -44,13 +39,6 @@ static int isnormal(double x)
 }
 #endif
 
-#if defined(_MSC_VER)
-static double copysign(double x, double y)
-{
-   return _copysign(x, y);
-}
-#endif
-
 /**
  * Evaluate one component of packSnorm4x8.
  */
diff --git a/src/glsl/nir/nir_from_ssa.c b/src/glsl/nir/nir_from_ssa.c
index 67733e6da4f..1fd8b24d33d 100644
--- a/src/glsl/nir/nir_from_ssa.c
+++ b/src/glsl/nir/nir_from_ssa.c
@@ -37,6 +37,7 @@
 struct from_ssa_state {
    void *mem_ctx;
    void *dead_ctx;
+   bool phi_webs_only;
    struct hash_table *merge_node_table;
    nir_instr *instr;
    nir_function_impl *impl;
@@ -482,6 +483,9 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
 
       reg = node->set->reg;
    } else {
+      if (state->phi_webs_only)
+         return true;
+
       /* We leave load_const SSA values alone.  They act as immediates to
        * the backend.  If it got coalesced into a phi, that's ok.
        */
@@ -492,21 +496,20 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
       reg->name = def->name;
       reg->num_components = def->num_components;
       reg->num_array_elems = 0;
-
-      /* This register comes from an SSA definition that is defined and not
-       * part of a phi-web.  Therefore, we know it has a single unique
-       * definition that dominates all of its uses; we can copy the
-       * parent_instr from the SSA def safely.
-       */
-      if (def->parent_instr->type != nir_instr_type_ssa_undef)
-         reg->parent_instr = def->parent_instr;
    }
 
    nir_ssa_def_rewrite_uses(def, nir_src_for_reg(reg), state->mem_ctx);
    assert(list_empty(&def->uses) && list_empty(&def->if_uses));
 
-   if (def->parent_instr->type == nir_instr_type_ssa_undef)
+   if (def->parent_instr->type == nir_instr_type_ssa_undef) {
+      /* If it's an ssa_undef instruction, remove it since we know we just got
+       * rid of all its uses.
+       */
+      nir_instr *parent_instr = def->parent_instr;
+      nir_instr_remove(parent_instr);
+      ralloc_steal(state->dead_ctx, parent_instr);
       return true;
+   }
 
    assert(def->parent_instr->type != nir_instr_type_load_const);
 
@@ -523,7 +526,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
 }
 
 /* Resolves ssa definitions to registers.  While we're at it, we also
- * remove phi nodes and ssa_undef instructions
+ * remove phi nodes.
  */
 static bool
 resolve_registers_block(nir_block *block, void *void_state)
@@ -534,8 +537,7 @@ resolve_registers_block(nir_block *block, void *void_state)
       state->instr = instr;
       nir_foreach_ssa_def(instr, rewrite_ssa_def, state);
 
-      if (instr->type == nir_instr_type_ssa_undef ||
-          instr->type == nir_instr_type_phi) {
+      if (instr->type == nir_instr_type_phi) {
          nir_instr_remove(instr);
          ralloc_steal(state->dead_ctx, instr);
       }
@@ -765,13 +767,14 @@ resolve_parallel_copies_block(nir_block *block, void *void_state)
 }
 
 static void
-nir_convert_from_ssa_impl(nir_function_impl *impl)
+nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
 {
    struct from_ssa_state state;
 
    state.mem_ctx = ralloc_parent(impl);
    state.dead_ctx = ralloc_context(NULL);
    state.impl = impl;
+   state.phi_webs_only = phi_webs_only;
    state.merge_node_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                                     _mesa_key_pointer_equal);
 
@@ -801,10 +804,10 @@ nir_convert_from_ssa_impl(nir_function_impl *impl)
 }
 
 void
-nir_convert_from_ssa(nir_shader *shader)
+nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only)
 {
    nir_foreach_overload(shader, overload) {
       if (overload->impl)
-         nir_convert_from_ssa_impl(overload->impl);
+         nir_convert_from_ssa_impl(overload->impl, phi_webs_only);
    }
 }
diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c
index 25bba4ef0b6..efbe9e7175f 100644
--- a/src/glsl/nir/nir_lower_alu_to_scalar.c
+++ b/src/glsl/nir/nir_lower_alu_to_scalar.c
@@ -100,6 +100,21 @@ lower_alu_instr_scalar(nir_alu_instr *instr, void *mem_ctx)
        */
       return;
 
+   case nir_op_unpack_unorm_4x8:
+   case nir_op_unpack_snorm_4x8:
+   case nir_op_unpack_unorm_2x16:
+   case nir_op_unpack_snorm_2x16:
+      /* There is no scalar version of these ops, unless we were to break it
+       * down to bitshifts and math (which is definitely not intended).
+       */
+      return;
+
+   case nir_op_unpack_half_2x16:
+      /* We could split this into unpack_half_2x16_split_[xy], but should
+       * we?
+       */
+      return;
+
       LOWER_REDUCTION(nir_op_fdot, nir_op_fmul, nir_op_fadd);
       LOWER_REDUCTION(nir_op_ball_fequal, nir_op_feq, nir_op_iand);
       LOWER_REDUCTION(nir_op_ball_iequal, nir_op_ieq, nir_op_iand);
@@ -164,7 +179,7 @@ lower_alu_to_scalar_block(nir_block *block, void *data)
 {
    nir_foreach_instr_safe(block, instr) {
       if (instr->type == nir_instr_type_alu)
-         lower_alu_instr_scalar((nir_alu_instr *)instr, data);
+         lower_alu_instr_scalar(nir_instr_as_alu(instr), data);
    }
 
    return true;
diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c
index 0457de60d9a..ce3615a3aa1 100644
--- a/src/glsl/nir/nir_lower_atomics.c
+++ b/src/glsl/nir/nir_lower_atomics.c
@@ -55,7 +55,8 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
       return;
    }
 
-   if (instr->variables[0]->var->data.mode != nir_var_uniform)
+   if (instr->variables[0]->var->data.mode != nir_var_uniform &&
+       instr->variables[0]->var->data.mode != nir_var_shader_storage)
       return; /* atomics passed as function arguments can't be lowered */
 
    void *mem_ctx = ralloc_parent(instr);
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 4c59298ecb7..6a4494d5fd2 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -29,19 +29,58 @@
 /*
  * This lowering pass converts references to input/output variables with
  * loads/stores to actual input/output intrinsics.
- *
- * NOTE: This pass really only works for scalar backends at the moment due
- * to the way it packes the input/output data.
  */
 
 #include "nir.h"
 
 struct lower_io_state {
    void *mem_ctx;
+   bool is_scalar;
 };
 
+/**
+ * Size of a type as counted for non-scalar layouts: one unit for a scalar or
+ * vector, one per matrix column, and the sum of the members for arrays and
+ * structs.  Subroutine types take one unit; samplers and atomics take none.
+ */
+static int
+type_size_vec4(const struct glsl_type *type)
+{
+   switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+      /* Anything that is not a matrix occupies exactly one unit. */
+      return glsl_type_is_matrix(type) ? glsl_get_matrix_columns(type) : 1;
+   case GLSL_TYPE_ARRAY: {
+      const int elem_slots = type_size_vec4(glsl_get_array_element(type));
+      return elem_slots * glsl_get_length(type);
+   }
+   case GLSL_TYPE_STRUCT: {
+      int slots = 0;
+      for (unsigned field = 0; field < glsl_get_length(type); field++)
+         slots += type_size_vec4(glsl_get_struct_field(type, field));
+      return slots;
+   }
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_ATOMIC_UINT:
+      return 0;
+   case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+      unreachable("not reached");
+   }
+
+   return 0;
+}
+
 static unsigned
-type_size(const struct glsl_type *type)
+type_size_scalar(const struct glsl_type *type)
 {
    unsigned int size, i;
 
@@ -52,13 +91,15 @@ type_size(const struct glsl_type *type)
    case GLSL_TYPE_BOOL:
       return glsl_get_components(type);
    case GLSL_TYPE_ARRAY:
-      return type_size(glsl_get_array_element(type)) * glsl_get_length(type);
+      return type_size_scalar(glsl_get_array_element(type)) * glsl_get_length(type);
    case GLSL_TYPE_STRUCT:
       size = 0;
       for (i = 0; i < glsl_get_length(type); i++) {
-         size += type_size(glsl_get_struct_field(type, i));
+         size += type_size_scalar(glsl_get_struct_field(type, i));
       }
       return size;
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
    case GLSL_TYPE_SAMPLER:
       return 0;
    case GLSL_TYPE_ATOMIC_UINT:
@@ -77,8 +118,17 @@ type_size(const struct glsl_type *type)
    return 0;
 }
 
+/* Size of type for the selected layout: defers to type_size_scalar() when
+ * is_scalar is set and to type_size_vec4() otherwise.
+ */
+static unsigned
+type_size(const struct glsl_type *type, bool is_scalar)
+{
+   return is_scalar ? type_size_scalar(type) : type_size_vec4(type);
+}
+
 void
-nir_assign_var_locations_scalar(struct exec_list *var_list, unsigned *size)
+nir_assign_var_locations(struct exec_list *var_list, unsigned *size, bool is_scalar)
 {
    unsigned location = 0;
 
@@ -87,11 +137,12 @@ nir_assign_var_locations_scalar(struct exec_list *var_list, unsigned *size)
        * UBO's have their own address spaces, so don't count them towards the
        * number of global uniforms
        */
-      if (var->data.mode == nir_var_uniform && var->interface_type != NULL)
+      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
+          var->interface_type != NULL)
          continue;
 
       var->data.driver_location = location;
-      location += type_size(var->type);
+      location += type_size(var->type, is_scalar);
    }
 
    *size = location;
@@ -137,10 +188,11 @@ mark_indirect_uses_block(nir_block *block, void *void_state)
  * assigns locations to variables that are used indirectly.
  */
 void
-nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
-                                             struct exec_list *var_list,
-                                             unsigned *direct_size,
-                                             unsigned *size)
+nir_assign_var_locations_direct_first(nir_shader *shader,
+                                      struct exec_list *var_list,
+                                      unsigned *direct_size,
+                                      unsigned *size,
+                                      bool is_scalar)
 {
    struct set *indirect_set = _mesa_set_create(NULL, _mesa_hash_pointer,
                                                _mesa_key_pointer_equal);
@@ -154,27 +206,29 @@ nir_assign_var_locations_scalar_direct_first(nir_shader *shader,
    unsigned location = 0;
 
    foreach_list_typed(nir_variable, var, node, var_list) {
-      if (var->data.mode == nir_var_uniform && var->interface_type != NULL)
+      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
+          var->interface_type != NULL)
          continue;
 
       if (_mesa_set_search(indirect_set, var))
          continue;
 
       var->data.driver_location = location;
-      location += type_size(var->type);
+      location += type_size(var->type, is_scalar);
    }
 
    *direct_size = location;
 
    foreach_list_typed(nir_variable, var, node, var_list) {
-      if (var->data.mode == nir_var_uniform && var->interface_type != NULL)
+      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
+          var->interface_type != NULL)
          continue;
 
       if (!_mesa_set_search(indirect_set, var))
          continue;
 
       var->data.driver_location = location;
-      location += type_size(var->type);
+      location += type_size(var->type, is_scalar);
    }
 
    *size = location;
@@ -196,7 +250,7 @@ get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
 
       if (tail->deref_type == nir_deref_type_array) {
          nir_deref_array *deref_array = nir_deref_as_array(tail);
-         unsigned size = type_size(tail->type);
+         unsigned size = type_size(tail->type, state->is_scalar);
 
          base_offset += size * deref_array->base_offset;
 
@@ -238,7 +292,8 @@ get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
          nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
 
          for (unsigned i = 0; i < deref_struct->index; i++)
-            base_offset += type_size(glsl_get_struct_field(parent_type, i));
+            base_offset += type_size(glsl_get_struct_field(parent_type, i),
+                                     state->is_scalar);
       }
    }
 
@@ -351,11 +406,12 @@ nir_lower_io_block(nir_block *block, void *void_state)
 }
 
 static void
-nir_lower_io_impl(nir_function_impl *impl)
+nir_lower_io_impl(nir_function_impl *impl, bool is_scalar)
 {
    struct lower_io_state state;
 
    state.mem_ctx = ralloc_parent(impl);
+   state.is_scalar = is_scalar;
 
    nir_foreach_block(impl, nir_lower_io_block, &state);
 
@@ -364,10 +420,10 @@ nir_lower_io_impl(nir_function_impl *impl)
 }
 
 void
-nir_lower_io(nir_shader *shader)
+nir_lower_io(nir_shader *shader, bool is_scalar)
 {
    nir_foreach_overload(shader, overload) {
       if (overload->impl)
-         nir_lower_io_impl(overload->impl);
+         nir_lower_io_impl(overload->impl, is_scalar);
    }
 }
diff --git a/src/glsl/nir/nir_lower_load_const_to_scalar.c b/src/glsl/nir/nir_lower_load_const_to_scalar.c
new file mode 100644
index 00000000000..a90e5245898
--- /dev/null
+++ b/src/glsl/nir/nir_lower_load_const_to_scalar.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/macros.h"
+#include "nir.h"
+#include "nir_builder.h"
+
+/** @file nir_lower_load_const_to_scalar.c
+ *
+ * Replaces vector nir_load_const instructions with a series of loads and a
+ * vec[234] to reconstruct the original vector (on the assumption that
+ * nir_lower_alu_to_scalar() will then be used to split it up).
+ *
+ * This gives NIR a chance to CSE more operations on a scalar shader, when the
+ * same value was used in different vector constant loads.
+ */
+
+/* Splits a vector load_const into one scalar load per component, rebuilds the
+ * vector with a vecN, and removes the original instruction.
+ */
+static void
+lower_load_const_instr_scalar(nir_load_const_instr *lower)
+{
+   const unsigned num_comps = lower->def.num_components;
+
+   if (num_comps == 1)
+      return;
+
+   nir_builder b;
+   nir_builder_init(&b, nir_cf_node_get_function(&lower->instr.block->cf_node));
+   nir_builder_insert_before_instr(&b, &lower->instr);
+
+   /* One single-component load per channel, placed ahead of the original. */
+   nir_ssa_def *chan[4];
+   for (unsigned c = 0; c < num_comps; c++) {
+      nir_load_const_instr *scalar = nir_load_const_instr_create(b.shader, 1);
+      scalar->value.u[0] = lower->value.u[c];
+      nir_builder_instr_insert(&b, &scalar->instr);
+      chan[c] = &scalar->def;
+   }
+
+   /* Glue the channels back into a vector of the original width. */
+   nir_ssa_def *vec;
+   if (num_comps == 2)
+      vec = nir_vec2(&b, chan[0], chan[1]);
+   else if (num_comps == 3)
+      vec = nir_vec3(&b, chan[0], chan[1], chan[2]);
+   else if (num_comps == 4)
+      vec = nir_vec4(&b, chan[0], chan[1], chan[2], chan[3]);
+   else
+      unreachable("Unknown load_const component count.");
+
+   /* Point every user at the rebuilt vector, then drop the old load. */
+   nir_ssa_def_rewrite_uses(&lower->def, nir_src_for_ssa(vec),
+                            ralloc_parent(b.impl));
+   nir_instr_remove(&lower->instr);
+}
+
+/* Block callback: scalarizes each load_const in block; data is unused. */
+static bool lower_load_const_to_scalar_block(nir_block *block, void *data)
+{
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_load_const)
+         continue;
+      lower_load_const_instr_scalar(nir_instr_as_load_const(instr));
+   }
+   return true;
+}
+
+/* Hands the block callback to nir_foreach_block for impl (no state). */
+static void nir_lower_load_const_to_scalar_impl(nir_function_impl *impl)
+{
+   nir_foreach_block(impl, lower_load_const_to_scalar_block, NULL);
+}
+
+/* Lowers vector load_const instructions in each overload that has an impl. */
+void nir_lower_load_const_to_scalar(nir_shader *shader)
+{
+   nir_foreach_overload(shader, overload) {
+      if (overload->impl != NULL)
+         nir_lower_load_const_to_scalar_impl(overload->impl);
+   }
+}
diff --git a/src/glsl/nir/nir_lower_phis_to_scalar.c b/src/glsl/nir/nir_lower_phis_to_scalar.c
index a57d253975d..739170d61fd 100644
--- a/src/glsl/nir/nir_lower_phis_to_scalar.c
+++ b/src/glsl/nir/nir_lower_phis_to_scalar.c
@@ -75,6 +75,7 @@ is_phi_src_scalarizable(nir_phi_src *src,
       return should_lower_phi(nir_instr_as_phi(src_instr), state);
 
    case nir_instr_type_load_const:
+   case nir_instr_type_ssa_undef:
       /* These are trivially scalarizable */
       return true;
 
diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 602853ea665..e6d522f88ce 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -90,7 +90,7 @@ lower_vec_to_movs_block(nir_block *block, void *mem_ctx)
       if (instr->type != nir_instr_type_alu)
          continue;
 
-      nir_alu_instr *vec = (nir_alu_instr *)instr;
+      nir_alu_instr *vec = nir_instr_as_alu(instr);
 
       switch (vec->op) {
       case nir_op_vec2:
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 56e96d9121e..df5b7e2d517 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -474,10 +474,10 @@ else
 """)
 
 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
-dst = ldexp(src0, src1);
+dst = ldexpf(src0, src1);
 /* flush denormals to zero. */
 if (!isnormal(dst))
-   dst = copysign(0.0f, src0);
+   dst = copysignf(0.0f, src0);
 """)
 
 # Combines the first component of each input to make a 2-component vector.
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index eace791f5b0..d7c17403f9f 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -101,6 +101,7 @@ optimizations = [
    (('umin', a, a), a),
    (('umax', a, a), a),
    (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
+   (('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
    (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
    (('fsat', ('fsat', a)), ('fsat', a)),
    (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),
@@ -131,6 +132,7 @@ optimizations = [
    # Logical and bit operations
    (('fand', a, 0.0), 0.0),
    (('iand', a, a), a),
+   (('iand', a, ~0), a),
    (('iand', a, 0), 0),
    (('ior', a, a), a),
    (('ior', a, 0), a),
diff --git a/src/glsl/nir/nir_opt_cse.c b/src/glsl/nir/nir_opt_cse.c
index 553906e1291..864795ce5ed 100644
--- a/src/glsl/nir/nir_opt_cse.c
+++ b/src/glsl/nir/nir_opt_cse.c
@@ -86,8 +86,41 @@ nir_instrs_equal(nir_instr *instr1, nir_instr *instr2)
       }
       return true;
    }
-   case nir_instr_type_tex:
-      return false;
+   case nir_instr_type_tex: {
+      nir_tex_instr *tex1 = nir_instr_as_tex(instr1);
+      nir_tex_instr *tex2 = nir_instr_as_tex(instr2);
+
+      if (tex1->op != tex2->op)
+         return false;
+
+      if (tex1->num_srcs != tex2->num_srcs)
+         return false;
+      for (unsigned i = 0; i < tex1->num_srcs; i++) {
+         if (tex1->src[i].src_type != tex2->src[i].src_type ||
+             !nir_srcs_equal(tex1->src[i].src, tex2->src[i].src)) {
+            return false;
+         }
+      }
+
+      if (tex1->coord_components != tex2->coord_components ||
+          tex1->sampler_dim != tex2->sampler_dim ||
+          tex1->is_array != tex2->is_array ||
+          tex1->is_shadow != tex2->is_shadow ||
+          tex1->is_new_style_shadow != tex2->is_new_style_shadow ||
+          memcmp(tex1->const_offset, tex2->const_offset,
+                 sizeof(tex1->const_offset)) != 0 ||
+          tex1->component != tex2->component ||
+         tex1->sampler_index != tex2->sampler_index ||
+         tex1->sampler_array_size != tex2->sampler_array_size) {
+         return false;
+      }
+
+      /* Don't support un-lowered sampler derefs currently. */
+      if (tex1->sampler || tex2->sampler)
+         return false;
+
+      return true;
+   }
    case nir_instr_type_load_const: {
       nir_load_const_instr *load1 = nir_instr_as_load_const(instr1);
       nir_load_const_instr *load2 = nir_instr_as_load_const(instr2);
@@ -181,11 +214,10 @@ nir_instr_can_cse(nir_instr *instr)
 
    switch (instr->type) {
    case nir_instr_type_alu:
+   case nir_instr_type_tex:
    case nir_instr_type_load_const:
    case nir_instr_type_phi:
       return true;
-   case nir_instr_type_tex:
-      return false; /* TODO */
    case nir_instr_type_intrinsic: {
       const nir_intrinsic_info *info =
          &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic];
@@ -212,6 +244,9 @@ nir_instr_get_dest_ssa_def(nir_instr *instr)
    case nir_instr_type_alu:
       assert(nir_instr_as_alu(instr)->dest.dest.is_ssa);
       return &nir_instr_as_alu(instr)->dest.dest.ssa;
+   case nir_instr_type_tex:
+      assert(nir_instr_as_tex(instr)->dest.is_ssa);
+      return &nir_instr_as_tex(instr)->dest.ssa;
    case nir_instr_type_load_const:
       return &nir_instr_as_load_const(instr)->def;
    case nir_instr_type_phi:
diff --git a/src/glsl/nir/nir_opt_peephole_ffma.c b/src/glsl/nir/nir_opt_peephole_ffma.c
index 798506b7595..a823adbb465 100644
--- a/src/glsl/nir/nir_opt_peephole_ffma.c
+++ b/src/glsl/nir/nir_opt_peephole_ffma.c
@@ -76,6 +76,7 @@ static nir_alu_instr *
 get_mul_for_src(nir_alu_src *src, int num_components,
                 uint8_t swizzle[4], bool *negate, bool *abs)
 {
+   uint8_t swizzle_tmp[4];
    assert(src->src.is_ssa && !src->abs && !src->negate);
 
    nir_instr *instr = src->src.ssa->parent_instr;
@@ -116,8 +117,18 @@ get_mul_for_src(nir_alu_src *src, int num_components,
    if (!alu)
       return NULL;
 
+   /* Copy swizzle data before overwriting it to avoid setting a wrong swizzle.
+    *
+    * Example:
+    *   Former swizzle[] = xyzw
+    *   src->swizzle[] = zyxx
+    *
+    *   Expected output swizzle = zyxx
+    *   If we reuse swizzle in the loop, then output swizzle would be zyzz.
+    */
+   memcpy(swizzle_tmp, swizzle, 4*sizeof(uint8_t));
    for (unsigned i = 0; i < num_components; i++)
-      swizzle[i] = swizzle[src->swizzle[i]];
+      swizzle[i] = swizzle_tmp[src->swizzle[i]];
 
    return alu;
 }
diff --git a/src/glsl/nir/nir_opt_peephole_select.c b/src/glsl/nir/nir_opt_peephole_select.c
index ef7c9775aa3..6620e5dc81f 100644
--- a/src/glsl/nir/nir_opt_peephole_select.c
+++ b/src/glsl/nir/nir_opt_peephole_select.c
@@ -82,14 +82,22 @@ block_check_for_allowed_instrs(nir_block *block)
          break;
 
       case nir_instr_type_alu: {
-         /* It must be a move operation */
          nir_alu_instr *mov = nir_instr_as_alu(instr);
-         if (mov->op != nir_op_fmov && mov->op != nir_op_imov &&
-             mov->op != nir_op_fneg && mov->op != nir_op_ineg &&
-             mov->op != nir_op_fabs && mov->op != nir_op_iabs &&
-             mov->op != nir_op_vec2 && mov->op != nir_op_vec3 &&
-             mov->op != nir_op_vec4)
+         switch (mov->op) {
+         case nir_op_fmov:
+         case nir_op_imov:
+         case nir_op_fneg:
+         case nir_op_ineg:
+         case nir_op_fabs:
+         case nir_op_iabs:
+         case nir_op_vec2:
+         case nir_op_vec3:
+         case nir_op_vec4:
+            /* It must be a move-like operation. */
+            break;
+         default:
             return false;
+         }
 
          /* Can't handle saturate */
          if (mov->dest.saturate)
diff --git a/src/glsl/nir/nir_opt_undef.c b/src/glsl/nir/nir_opt_undef.c
new file mode 100644
index 00000000000..4ab27a8c9d5
--- /dev/null
+++ b/src/glsl/nir/nir_opt_undef.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+
+/** @file nir_opt_undef.c
+ *
+ * Handles optimization of operations involving ssa_undef.  For now, we just
+ * make sure that csels between undef and some other value just give the other
+ * value (on the assumption that the condition's going to be choosing the
+ * defined value).  This reduces work after if flattening when each side of
+ * the if is defining a variable.
+ *
+ * Some day, we may find some use for making other operations consuming an
+ * undef arg output undef, but I don't know of any cases currently.
+ */
+
+/* Turns a bcsel/fcsel whose second or third operand is an ssa_undef into an
+ * imov of the remaining value operand.  Returns true if instr was rewritten.
+ */
+static bool
+opt_undef_alu(nir_alu_instr *instr)
+{
+   const bool is_csel = instr->op == nir_op_bcsel || instr->op == nir_op_fcsel;
+   if (!is_csel)
+      return false;
+
+   assert(instr->dest.dest.is_ssa);
+
+   for (int undef_idx = 1; undef_idx <= 2; undef_idx++) {
+      nir_alu_src *candidate = &instr->src[undef_idx];
+      if (!candidate->src.is_ssa ||
+          candidate->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
+         continue;
+
+      nir_alu_src *kept = &instr->src[3 - undef_idx];
+
+      /* nir_alu_src_copy alone would not update the def/use lists. */
+      nir_instr_rewrite_src(&instr->instr, &instr->src[0].src, kept->src);
+      nir_alu_src_copy(&instr->src[0], kept, ralloc_parent(instr));
+
+      /* Clear the two value operands; the instruction is now a plain move. */
+      nir_src empty_src;
+      memset(&empty_src, 0, sizeof(empty_src));
+      nir_instr_rewrite_src(&instr->instr, &instr->src[1].src, empty_src);
+      nir_instr_rewrite_src(&instr->instr, &instr->src[2].src, empty_src);
+      instr->op = nir_op_imov;
+      return true;
+   }
+
+   return false;
+}
+
+/* Block callback: flags *data (a bool) when an ALU instr is rewritten. */
+static bool
+opt_undef_block(nir_block *block, void *data)
+{
+   bool *made_progress = data;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type == nir_instr_type_alu &&
+          opt_undef_alu(nir_instr_as_alu(instr)))
+         *made_progress = true;
+   }
+   return true;
+}
+
+/* Returns whether any impl changed; metadata is updated per changed impl. */
+bool
+nir_opt_undef(nir_shader *shader)
+{
+   bool progress = false;
+   nir_foreach_overload(shader, overload) {
+      bool impl_progress = false;
+      if (overload->impl) {
+         nir_foreach_block(overload->impl, opt_undef_block, &impl_progress);
+         if (impl_progress)
+            nir_metadata_preserve(overload->impl, nir_metadata_block_index |
+                                                  nir_metadata_dominance);
+      }
+      progress |= impl_progress;
+   }
+   return progress;
+}
diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index eb4045cec8e..f591c4b5f8d 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -214,7 +214,7 @@ print_var_decl(nir_variable *var, print_var_state *state, FILE *fp)
    const char *const samp = (var->data.sample) ? "sample " : "";
    const char *const inv = (var->data.invariant) ? "invariant " : "";
    const char *const mode[] = { "shader_in ", "shader_out ", "", "",
-                                "uniform ", "system " };
+                                "uniform ", "shader_storage", "system " };
    const char *const interp[] = { "", "smooth", "flat", "noperspective" };
 
    fprintf(fp, "%s%s%s%s%s ",
@@ -239,7 +239,8 @@ print_var_decl(nir_variable *var, print_var_state *state, FILE *fp)
 
    if (var->data.mode == nir_var_shader_in ||
        var->data.mode == nir_var_shader_out ||
-       var->data.mode == nir_var_uniform) {
+       var->data.mode == nir_var_uniform ||
+       var->data.mode == nir_var_shader_storage) {
       fprintf(fp, " (%u, %u)", var->data.location, var->data.driver_location);
    }
 
diff --git a/src/glsl/nir/nir_search.c b/src/glsl/nir/nir_search.c
index 0c4e48ce965..c33d6c3eb84 100644
--- a/src/glsl/nir/nir_search.c
+++ b/src/glsl/nir/nir_search.c
@@ -48,7 +48,7 @@ src_is_bool(nir_src src)
       return false;
    if (src.ssa->parent_instr->type != nir_instr_type_alu)
       return false;
-   return alu_instr_is_bool((nir_alu_instr *)src.ssa->parent_instr);
+   return alu_instr_is_bool(nir_instr_as_alu(src.ssa->parent_instr));
 }
 
 static bool
diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c
index da92ed90472..dc799414d24 100644
--- a/src/glsl/nir/nir_validate.c
+++ b/src/glsl/nir/nir_validate.c
@@ -400,11 +400,13 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
       break;
    case nir_intrinsic_store_var:
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
-             instr->variables[0]->var->data.mode != nir_var_uniform);
+             instr->variables[0]->var->data.mode != nir_var_uniform &&
+             instr->variables[0]->var->data.mode != nir_var_shader_storage);
       break;
    case nir_intrinsic_copy_var:
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
-             instr->variables[0]->var->data.mode != nir_var_uniform);
+             instr->variables[0]->var->data.mode != nir_var_uniform &&
+             instr->variables[0]->var->data.mode != nir_var_shader_storage);
       assert(instr->variables[1]->var->data.mode != nir_var_shader_out);
       break;
    default:
diff --git a/src/glsl/nir/spirv_to_nir_private.h b/src/glsl/nir/spirv_to_nir_private.h
index a964cc80fad..decceff65a6 100644
--- a/src/glsl/nir/spirv_to_nir_private.h
+++ b/src/glsl/nir/spirv_to_nir_private.h
@@ -25,6 +25,7 @@
  *
  */
 
+#include "nir.h"
 #include "nir_spirv.h"
 #include "nir_builder.h"
 #include "spirv.h"
diff --git a/src/glsl/opt_algebraic.cpp b/src/glsl/opt_algebraic.cpp
index fa5db70f2dd..c4b87151199 100644
--- a/src/glsl/opt_algebraic.cpp
+++ b/src/glsl/opt_algebraic.cpp
@@ -580,7 +580,7 @@ ir_algebraic_visitor::handle_expression(ir_expression *ir)
             continue;
 
          ir_expression *add_expr = floor_expr->operands[0]->as_expression();
-         if (!add_expr)
+         if (!add_expr || add_expr->operation != ir_binop_add)
             continue;
 
          for (int j = 0; j < 2; j++) {
@@ -589,7 +589,7 @@ ir_algebraic_visitor::handle_expression(ir_expression *ir)
                continue;
 
             ir_constant *point_five = add_expr->operands[1 - j]->as_constant();
-            if (!point_five->is_value(0.5, 0))
+            if (!point_five || !point_five->is_value(0.5, 0))
                continue;
 
             if (abs_expr->operands[0]->equals(sign_expr->operands[0])) {
diff --git a/src/glsl/opt_constant_propagation.cpp b/src/glsl/opt_constant_propagation.cpp
index 90cc0c89b65..10be8e800ff 100644
--- a/src/glsl/opt_constant_propagation.cpp
+++ b/src/glsl/opt_constant_propagation.cpp
@@ -444,6 +444,14 @@ ir_constant_propagation_visitor::add_constant(ir_assignment *ir)
    if (!deref->var->type->is_vector() && !deref->var->type->is_scalar())
       return;
 
+   /* We can't do constant propagation on buffer variables: the underlying
+    * memory storage is shared across multiple threads, so we can't be sure
+    * that the variable's value isn't modified between this assignment and
+    * the next instruction where its value is read.
+    */
+   if (deref->var->data.mode == ir_var_shader_storage)
+      return;
+
    entry = new(this->mem_ctx) acp_entry(deref->var, ir->write_mask, constant);
    this->acp->push_tail(entry);
 }
diff --git a/src/glsl/opt_constant_variable.cpp b/src/glsl/opt_constant_variable.cpp
index 7222eb92a7d..7aaaeedf98d 100644
--- a/src/glsl/opt_constant_variable.cpp
+++ b/src/glsl/opt_constant_variable.cpp
@@ -115,6 +115,13 @@ ir_constant_variable_visitor::visit_enter(ir_assignment *ir)
    if (!var)
       return visit_continue;
 
+   /* Ignore buffer variables, since the underlying storage is shared
+    * and we can't be sure that this variable won't be written by another
+    * thread.
+    */
+   if (var->data.mode == ir_var_shader_storage)
+      return visit_continue;
+
    constval = ir->rhs->constant_expression_value();
    if (!constval)
       return visit_continue;
diff --git a/src/glsl/opt_copy_propagation.cpp b/src/glsl/opt_copy_propagation.cpp
index 806027b280e..f20699563fd 100644
--- a/src/glsl/opt_copy_propagation.cpp
+++ b/src/glsl/opt_copy_propagation.cpp
@@ -330,7 +330,7 @@ ir_copy_propagation_visitor::add_copy(ir_assignment *ir)
 	  */
 	 ir->condition = new(ralloc_parent(ir)) ir_constant(false);
 	 this->progress = true;
-      } else {
+      } else if (lhs_var->data.mode != ir_var_shader_storage) {
 	 entry = new(this->acp) acp_entry(lhs_var, rhs_var);
 	 this->acp->push_tail(entry);
       }
diff --git a/src/glsl/opt_dead_code.cpp b/src/glsl/opt_dead_code.cpp
index f45bf5dfdf8..e4bf874700c 100644
--- a/src/glsl/opt_dead_code.cpp
+++ b/src/glsl/opt_dead_code.cpp
@@ -77,11 +77,13 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 
       if (entry->assign) {
 	 /* Remove a single dead assignment to the variable we found.
-	  * Don't do so if it's a shader or function output, though.
+	  * Don't do so if it's a shader or function output or a shader
+	  * storage variable though.
 	  */
 	 if (entry->var->data.mode != ir_var_function_out &&
 	     entry->var->data.mode != ir_var_function_inout &&
-             entry->var->data.mode != ir_var_shader_out) {
+             entry->var->data.mode != ir_var_shader_out &&
+             entry->var->data.mode != ir_var_shader_storage) {
 	    entry->assign->remove();
 	    progress = true;
 
@@ -99,7 +101,8 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
 	  * stage.  Also, once uniform locations have been assigned, the
 	  * declaration cannot be deleted.
 	  */
-         if (entry->var->data.mode == ir_var_uniform) {
+         if (entry->var->data.mode == ir_var_uniform ||
+             entry->var->data.mode == ir_var_shader_storage) {
             if (uniform_locations_assigned || entry->var->constant_value)
                continue;
 
@@ -115,7 +118,7 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
              * If the variable is in a uniform block with one of those
              * layouts, do not eliminate it.
              */
-            if (entry->var->is_in_uniform_block()) {
+            if (entry->var->is_in_buffer_block()) {
                const glsl_type *const block_type =
                   entry->var->is_interface_instance()
                   ? entry->var->type : entry->var->get_interface_type();
@@ -123,6 +126,9 @@ do_dead_code(exec_list *instructions, bool uniform_locations_assigned)
                if (block_type->interface_packing != GLSL_INTERFACE_PACKING_PACKED)
                   continue;
             }
+
+            if (entry->var->type->is_subroutine())
+               continue;
          }
 
 	 entry->var->remove();
diff --git a/src/glsl/opt_structure_splitting.cpp b/src/glsl/opt_structure_splitting.cpp
index 5e82fe93aa7..abf4310feb3 100644
--- a/src/glsl/opt_structure_splitting.cpp
+++ b/src/glsl/opt_structure_splitting.cpp
@@ -103,8 +103,9 @@ ir_structure_reference_visitor::get_variable_entry(ir_variable *var)
 {
    assert(var);
 
-   if (!var->type->is_record() || var->data.mode == ir_var_uniform
-       || var->data.mode == ir_var_shader_in || var->data.mode == ir_var_shader_out)
+   if (!var->type->is_record() ||
+       var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage ||
+       var->data.mode == ir_var_shader_in || var->data.mode == ir_var_shader_out)
       return NULL;
 
    foreach_in_list(variable_entry, entry, &this->variable_list) {
diff --git a/src/glsl/opt_tree_grafting.cpp b/src/glsl/opt_tree_grafting.cpp
index d47613c2190..7f2ee6cee34 100644
--- a/src/glsl/opt_tree_grafting.cpp
+++ b/src/glsl/opt_tree_grafting.cpp
@@ -359,10 +359,11 @@ tree_grafting_basic_block(ir_instruction *bb_first,
       if (!lhs_var)
 	 continue;
 
-      if (lhs_var->data.mode == ir_var_function_out ||
-	  lhs_var->data.mode == ir_var_function_inout ||
-          lhs_var->data.mode == ir_var_shader_out)
-	 continue;
+   if (lhs_var->data.mode == ir_var_function_out ||
+       lhs_var->data.mode == ir_var_function_inout ||
+       lhs_var->data.mode == ir_var_shader_out ||
+       lhs_var->data.mode == ir_var_shader_storage)
+      continue;
 
       ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var);
 
diff --git a/src/glsl/program.h b/src/glsl/program.h
index f15113a08d2..c06541a6105 100644
--- a/src/glsl/program.h
+++ b/src/glsl/program.h
@@ -39,6 +39,10 @@ _mesa_glsl_compile_shader(struct gl_context *ctx, struct gl_shader *shader,
 extern void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog);
 
+extern void
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg);
+
 extern void
 linker_error(struct gl_shader_program *prog, const char *fmt, ...)
    PRINTFLIKE(2, 3);
diff --git a/src/glsl/shader_enums.h b/src/glsl/shader_enums.h
index 7ebc3b74b0e..3fef7c48d6f 100644
--- a/src/glsl/shader_enums.h
+++ b/src/glsl/shader_enums.h
@@ -38,9 +38,11 @@
 typedef enum
 {
    MESA_SHADER_VERTEX = 0,
-   MESA_SHADER_GEOMETRY = 1,
-   MESA_SHADER_FRAGMENT = 2,
-   MESA_SHADER_COMPUTE = 3,
+   MESA_SHADER_TESS_CTRL = 1,
+   MESA_SHADER_TESS_EVAL = 2,
+   MESA_SHADER_GEOMETRY = 3,
+   MESA_SHADER_FRAGMENT = 4,
+   MESA_SHADER_COMPUTE = 5,
 } gl_shader_stage;
 
 #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
@@ -152,7 +154,7 @@ typedef enum
     * \name Geometry shader system values
     */
    /*@{*/
-   SYSTEM_VALUE_INVOCATION_ID,
+   SYSTEM_VALUE_INVOCATION_ID,  /**< (Also in Tessellation Control shader) */
    /*@}*/
 
    /**
@@ -165,6 +167,17 @@ typedef enum
    SYSTEM_VALUE_SAMPLE_MASK_IN,
    /*@}*/
 
+   /**
+    * \name Tessellation Evaluation shader system values
+    */
+   /*@{*/
+   SYSTEM_VALUE_TESS_COORD,
+   SYSTEM_VALUE_VERTICES_IN,    /**< Tessellation vertices in input patch */
+   SYSTEM_VALUE_PRIMITIVE_ID,   /**< (currently not used by GS) */
+   SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
+   SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
+   /*@}*/
+
    SYSTEM_VALUE_MAX             /**< Number of values */
 } gl_system_value;
 
@@ -322,15 +335,20 @@ typedef enum
    VARYING_SLOT_BFC0, /* Does not appear in FS */
    VARYING_SLOT_BFC1, /* Does not appear in FS */
    VARYING_SLOT_EDGE, /* Does not appear in FS */
-   VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */ VARYING_SLOT_CLIP_DIST0,
+   VARYING_SLOT_CLIP_VERTEX, /* Does not appear in FS */
+   VARYING_SLOT_CLIP_DIST0,
    VARYING_SLOT_CLIP_DIST1,
    VARYING_SLOT_PRIMITIVE_ID, /* Does not appear in VS */
    VARYING_SLOT_LAYER, /* Appears as VS or GS output */
    VARYING_SLOT_VIEWPORT, /* Appears as VS or GS output */
    VARYING_SLOT_FACE, /* FS only */
    VARYING_SLOT_PNTC, /* FS only */
+   VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
+   VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
    VARYING_SLOT_VAR0, /* First generic varying slot */
-   VARYING_SLOT_MAX = VARYING_SLOT_VAR0 + MAX_VARYING
+   VARYING_SLOT_MAX = VARYING_SLOT_VAR0 + MAX_VARYING,
+   VARYING_SLOT_PATCH0 = VARYING_SLOT_MAX,
+   VARYING_SLOT_TESS_MAX = VARYING_SLOT_PATCH0 + MAX_VARYING
 } gl_varying_slot;
 
 
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 050e733d549..6ff9553d6fe 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -33,6 +33,7 @@
 #include <stdio.h>
 #include <string.h>
 #include "util/ralloc.h"
+#include "util/strtod.h"
 
 extern "C" void
 _mesa_error_no_memory(const char *caller)
@@ -138,9 +139,11 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
    ctx->Extensions.ARB_sample_shading = true;
    ctx->Extensions.ARB_shader_bit_encoding = true;
    ctx->Extensions.ARB_shader_stencil_export = true;
+   ctx->Extensions.ARB_shader_subroutine = true;
    ctx->Extensions.ARB_shader_texture_lod = true;
    ctx->Extensions.ARB_shading_language_420pack = true;
    ctx->Extensions.ARB_shading_language_packing = true;
+   ctx->Extensions.ARB_tessellation_shader = true;
    ctx->Extensions.ARB_texture_cube_map_array = true;
    ctx->Extensions.ARB_texture_gather = true;
    ctx->Extensions.ARB_texture_multisample = true;
@@ -197,4 +200,6 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
 
    for (int sh = 0; sh < MESA_SHADER_STAGES; ++sh)
       memcpy(&ctx->Const.ShaderCompilerOptions[sh], &options, sizeof(options));
+
+   _mesa_locale_init();
 }
diff --git a/src/glsl/standalone_scaffolding.h b/src/glsl/standalone_scaffolding.h
index 895dd2782fb..dc6fb640f15 100644
--- a/src/glsl/standalone_scaffolding.h
+++ b/src/glsl/standalone_scaffolding.h
@@ -61,6 +61,10 @@ _mesa_shader_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_SHADER:
       return MESA_SHADER_GEOMETRY;
+   case GL_TESS_CONTROL_SHADER:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SHADER:
+      return MESA_SHADER_TESS_EVAL;
    case GL_COMPUTE_SHADER:
       return MESA_SHADER_COMPUTE;
    default:
diff --git a/src/glsl/test_optpass.cpp b/src/glsl/test_optpass.cpp
index ac3e3f48c51..fed1fabf301 100644
--- a/src/glsl/test_optpass.cpp
+++ b/src/glsl/test_optpass.cpp
@@ -124,7 +124,8 @@ do_optimization(struct exec_list *ir, const char *optimization,
    } else if (sscanf(optimization, "lower_variable_index_to_cond_assign "
                      "( %d , %d , %d , %d ) ", &int_0, &int_1, &int_2,
                      &int_3) == 4) {
-      return lower_variable_index_to_cond_assign(ir, int_0 != 0, int_1 != 0,
+      return lower_variable_index_to_cond_assign(MESA_SHADER_VERTEX, ir,
+                                                 int_0 != 0, int_1 != 0,
                                                  int_2 != 0, int_3 != 0);
    } else if (sscanf(optimization, "lower_quadop_vector ( %d ) ",
                      &int_0) == 1) {
diff --git a/src/glsl/tests/general_ir_test.cpp b/src/glsl/tests/general_ir_test.cpp
index 882642d141b..217305bf847 100644
--- a/src/glsl/tests/general_ir_test.cpp
+++ b/src/glsl/tests/general_ir_test.cpp
@@ -31,11 +31,7 @@ TEST(ir_variable_constructor, interface)
    void *mem_ctx = ralloc_context(NULL);
 
    static const glsl_struct_field f[] = {
-      {
-         glsl_type::vec(4),
-         "v",
-         false
-      }
+      glsl_struct_field(glsl_type::vec(4), "v")
    };
 
    const glsl_type *const interface =
@@ -60,11 +56,7 @@ TEST(ir_variable_constructor, interface_array)
    void *mem_ctx = ralloc_context(NULL);
 
    static const glsl_struct_field f[] = {
-      {
-         glsl_type::vec(4),
-         "v",
-         false
-      }
+      glsl_struct_field(glsl_type::vec(4), "v")
    };
 
    const glsl_type *const interface =
diff --git a/src/glsl/tests/uniform_initializer_utils.cpp b/src/glsl/tests/uniform_initializer_utils.cpp
index b90bdcaed3b..5006387036f 100644
--- a/src/glsl/tests/uniform_initializer_utils.cpp
+++ b/src/glsl/tests/uniform_initializer_utils.cpp
@@ -102,6 +102,7 @@ generate_data_element(void *mem_ctx, const glsl_type *type,
       case GLSL_TYPE_VOID:
       case GLSL_TYPE_ERROR:
       case GLSL_TYPE_INTERFACE:
+      case GLSL_TYPE_SUBROUTINE:
 	 ASSERT_TRUE(false);
 	 break;
       }
@@ -134,6 +135,7 @@ generate_data_element(void *mem_ctx, const glsl_type *type,
       case GLSL_TYPE_VOID:
       case GLSL_TYPE_ERROR:
       case GLSL_TYPE_INTERFACE:
+      case GLSL_TYPE_SUBROUTINE:
 	 ASSERT_TRUE(false);
 	 break;
       }
@@ -238,6 +240,7 @@ verify_data(gl_constant_value *storage, unsigned storage_array_size,
 	 case GLSL_TYPE_VOID:
 	 case GLSL_TYPE_ERROR:
 	 case GLSL_TYPE_INTERFACE:
+	 case GLSL_TYPE_SUBROUTINE:
 	    ASSERT_TRUE(false);
 	    break;
 	 }
diff --git a/src/glsl/tests/varyings_test.cpp b/src/glsl/tests/varyings_test.cpp
index 4573529f619..0c4e0a471b8 100644
--- a/src/glsl/tests/varyings_test.cpp
+++ b/src/glsl/tests/varyings_test.cpp
@@ -70,21 +70,13 @@ public:
    hash_table *consumer_interface_inputs;
 
    const glsl_type *simple_interface;
-   ir_variable *junk[VARYING_SLOT_MAX];
+   ir_variable *junk[VARYING_SLOT_TESS_MAX];
 };
 
 link_varyings::link_varyings()
 {
    static const glsl_struct_field f[] = {
-      {
-         glsl_type::vec(4),
-         "v",
-         false,
-         0,
-         0,
-         0,
-         0
-      }
+      glsl_struct_field(glsl_type::vec(4), "v")
    };
 
    this->simple_interface =
diff --git a/src/glx/compsize.c b/src/glx/compsize.c
index 99c7763c7f9..805591914c5 100644
--- a/src/glx/compsize.c
+++ b/src/glx/compsize.c
@@ -65,6 +65,8 @@ __glElementsPerGroup(GLenum format, GLenum type)
    switch (format) {
    case GL_RGB:
    case GL_BGR:
+   case GL_RGB_INTEGER_EXT:
+   case GL_BGR_INTEGER_EXT:
       return 3;
    case GL_RG:
    case GL_422_EXT:
@@ -74,10 +76,13 @@ __glElementsPerGroup(GLenum format, GLenum type)
    case GL_DEPTH_STENCIL_NV:
    case GL_YCBCR_422_APPLE:
    case GL_LUMINANCE_ALPHA:
+   case GL_LUMINANCE_ALPHA_INTEGER_EXT:
       return 2;
    case GL_RGBA:
    case GL_BGRA:
    case GL_ABGR_EXT:
+   case GL_RGBA_INTEGER_EXT:
+   case GL_BGRA_INTEGER_EXT:
       return 4;
    case GL_COLOR_INDEX:
    case GL_STENCIL_INDEX:
@@ -88,6 +93,11 @@ __glElementsPerGroup(GLenum format, GLenum type)
    case GL_ALPHA:
    case GL_LUMINANCE:
    case GL_INTENSITY:
+   case GL_RED_INTEGER_EXT:
+   case GL_GREEN_INTEGER_EXT:
+   case GL_BLUE_INTEGER_EXT:
+   case GL_ALPHA_INTEGER_EXT:
+   case GL_LUMINANCE_INTEGER_EXT:
       return 1;
    default:
       return 0;
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index dfb0093395f..96f13e6a07b 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -1679,6 +1679,8 @@ dri3_open(Display *dpy,
    fd = xcb_dri3_open_reply_fds(c, reply)[0];
    fcntl(fd, F_SETFD, FD_CLOEXEC);
 
+   free(reply);
+
    return fd;
 }
 
diff --git a/src/glx/glxextensions.c b/src/glx/glxextensions.c
index cb8cd665a88..3b29aef1234 100644
--- a/src/glx/glxextensions.c
+++ b/src/glx/glxextensions.c
@@ -241,6 +241,7 @@ static const struct extension_info known_gl_extensions[] = {
    { GL(EXT_texture_env_combine),        VER(1,3), Y, N, N, N },
    { GL(EXT_texture_env_dot3),           VER(0,0), Y, N, N, N },
    { GL(EXT_texture_filter_anisotropic), VER(0,0), Y, N, N, N },
+   { GL(EXT_texture_integer),            VER(0,0), Y, N, N, N },
    { GL(EXT_texture_lod),                VER(1,2), Y, N, N, N },
    { GL(EXT_texture_lod_bias),           VER(1,4), Y, N, N, N },
    { GL(EXT_texture_mirror_clamp),       VER(0,0), Y, N, N, N },
diff --git a/src/glx/glxextensions.h b/src/glx/glxextensions.h
index 37e4ccc8303..90b173fc915 100644
--- a/src/glx/glxextensions.h
+++ b/src/glx/glxextensions.h
@@ -146,6 +146,7 @@ enum
    GL_EXT_texture_env_combine_bit,
    GL_EXT_texture_env_dot3_bit,
    GL_EXT_texture_filter_anisotropic_bit,
+   GL_EXT_texture_integer_bit,
    GL_EXT_texture_lod_bit,
    GL_EXT_texture_lod_bias_bit,
    GL_EXT_texture_mirror_clamp_bit,
diff --git a/src/glx/pixelstore.c b/src/glx/pixelstore.c
index 1d776b817e5..dc33ff300fc 100644
--- a/src/glx/pixelstore.c
+++ b/src/glx/pixelstore.c
@@ -30,6 +30,7 @@
 
 #include "glxclient.h"
 #include "indirect.h"
+#include "util/rounding.h"
 
 #if !defined(__GNUC__)
 #  define __builtin_expect(x, y) x
@@ -77,7 +78,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
 
    switch (pname) {
    case GL_PACK_ROW_LENGTH:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -85,7 +86,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.rowLength = a;
       break;
    case GL_PACK_IMAGE_HEIGHT:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -93,7 +94,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.imageHeight = a;
       break;
    case GL_PACK_SKIP_ROWS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -101,7 +102,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.skipRows = a;
       break;
    case GL_PACK_SKIP_PIXELS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -109,7 +110,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.skipPixels = a;
       break;
    case GL_PACK_SKIP_IMAGES:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -117,7 +118,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storePack.skipImages = a;
       break;
    case GL_PACK_ALIGNMENT:
-      a = (GLint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       switch (a) {
       case 1:
       case 2:
@@ -138,7 +139,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       break;
 
    case GL_UNPACK_ROW_LENGTH:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -146,7 +147,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.rowLength = a;
       break;
    case GL_UNPACK_IMAGE_HEIGHT:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -154,7 +155,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.imageHeight = a;
       break;
    case GL_UNPACK_SKIP_ROWS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -162,7 +163,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.skipRows = a;
       break;
    case GL_UNPACK_SKIP_PIXELS:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -170,7 +171,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.skipPixels = a;
       break;
    case GL_UNPACK_SKIP_IMAGES:
-      a = (GLuint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       if (((GLint) a) < 0) {
          __glXSetError(gc, GL_INVALID_VALUE);
          return;
@@ -178,7 +179,7 @@ __indirect_glPixelStoref(GLenum pname, GLfloat param)
       state->storeUnpack.skipImages = a;
       break;
    case GL_UNPACK_ALIGNMENT:
-      a = (GLint) (param + 0.5);
+      a = _mesa_lroundevenf(param);
       switch (a) {
       case 1:
       case 2:
diff --git a/src/gtest/Makefile.am b/src/gtest/Makefile.am
index 47d392bc705..29d6c6d1998 100644
--- a/src/gtest/Makefile.am
+++ b/src/gtest/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 AM_CFLAGS = $(DEFINES) -I$(top_srcdir)/src/gtest/include
 AM_CXXFLAGS = $(DEFINES) -I$(top_srcdir)/src/gtest/include
 
diff --git a/src/loader/Android.mk b/src/loader/Android.mk
index 92d9fd20d3c..869056564ce 100644
--- a/src/loader/Android.mk
+++ b/src/loader/Android.mk
@@ -33,10 +33,8 @@ include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
 	$(LOADER_C_FILES)
 
-# swrast only
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
-else
+ifneq ($(filter-out swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DHAVE_LIBDRM
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
 
diff --git a/src/loader/Makefile.am b/src/loader/Makefile.am
index aef1bd61bea..5190f7f8a46 100644
--- a/src/loader/Makefile.am
+++ b/src/loader/Makefile.am
@@ -48,10 +48,7 @@ libloader_la_CPPFLAGS += \
 
 endif
 
-if !HAVE_LIBDRM
-libloader_la_CPPFLAGS += \
-	-D__NOT_HAVE_DRM_H
-else
+if HAVE_LIBDRM
 libloader_la_CPPFLAGS += \
 	$(LIBDRM_CFLAGS)
 
diff --git a/src/loader/SConscript b/src/loader/SConscript
index 16d1053ff2d..d98f11e3cf6 100644
--- a/src/loader/SConscript
+++ b/src/loader/SConscript
@@ -8,8 +8,6 @@ env.Prepend(CPPPATH = [
     '#include'
 ])
 
-env.Append(CPPDEFINES = ['__NOT_HAVE_DRM_H'])
-
 if env['udev']:
     env.PkgUseModules('UDEV')
     env.Append(CPPDEFINES = ['HAVE_LIBUDEV'])
diff --git a/src/loader/loader.c b/src/loader/loader.c
index fc468153425..8634f45e78d 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -64,6 +64,8 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
+#include <errno.h>
+#include <fcntl.h>
 #include <sys/stat.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -71,10 +73,8 @@
 #ifdef HAVE_LIBUDEV
 #include <assert.h>
 #include <dlfcn.h>
-#include <fcntl.h>
 #include <unistd.h>
 #include <stdlib.h>
-#include <errno.h>
 #ifdef USE_DRICONF
 #include "xmlconfig.h"
 #include "xmlpool.h"
@@ -85,7 +85,7 @@
 #endif
 #include "loader.h"
 
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
 #include <xf86drm.h>
 #endif
 
@@ -104,6 +104,22 @@ static void default_logger(int level, const char *fmt, ...)
 
 static void (*log_)(int level, const char *fmt, ...) = default_logger;
 
+/* Open device_name read/write with close-on-exec set; returns fd or -1. */
+int
+loader_open_device(const char *device_name)
+{
+   int fd;
+#ifdef O_CLOEXEC
+   fd = open(device_name, O_RDWR | O_CLOEXEC);
+   if (fd != -1 || errno != EINVAL)
+      return fd;
+#endif
+   fd = open(device_name, O_RDWR);
+   if (fd != -1)
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+   return fd;
+}
+
 #ifdef HAVE_LIBUDEV
 #include <libudev.h>
 
@@ -112,26 +128,36 @@ static void *udev_handle = NULL;
 static void *
 udev_dlopen_handle(void)
 {
-   if (!udev_handle) {
-      udev_handle = dlopen("libudev.so.1", RTLD_LOCAL | RTLD_LAZY);
+   char name[80];
+   unsigned flags = RTLD_NOLOAD | RTLD_LOCAL | RTLD_LAZY;
+   int version;
 
-      if (!udev_handle) {
-         /* libudev.so.1 changed the return types of the two unref functions
-          * from voids to pointers.  We don't use those return values, and the
-          * only ABI I've heard that cares about this kind of change (calling
-          * a function with a void * return that actually only returns void)
-          * might be ia64.
-          */
-         udev_handle = dlopen("libudev.so.0", RTLD_LOCAL | RTLD_LAZY);
+   /* libudev.so.1 changed the return types of the two unref functions
+    * from voids to pointers.  We don't use those return values, and the
+    * only ABI I've heard that cares about this kind of change (calling
+    * a function with a void * return that actually only returns void)
+    * might be ia64.
+    */
 
-         if (!udev_handle) {
-            log_(_LOADER_WARNING, "Couldn't dlopen libudev.so.1 or "
-                 "libudev.so.0, driver detection may be broken.\n");
-         }
+   /* First try opening an already linked libudev, then try loading one */
+   do {
+      for (version = 1; version >= 0; version--) {
+         snprintf(name, sizeof(name), "libudev.so.%d", version);
+         udev_handle = dlopen(name, flags);
+         if (udev_handle)
+            return udev_handle;
       }
-   }
 
-   return udev_handle;
+      if ((flags & RTLD_NOLOAD) == 0)
+         break;
+
+      flags &= ~RTLD_NOLOAD;
+   } while (1);
+
+   log_(_LOADER_WARNING,
+        "Couldn't dlopen libudev.so.1 or "
+        "libudev.so.0, driver detection may be broken.\n");
+   return NULL;
 }
 
 static int dlsym_failed = 0;
@@ -247,6 +273,8 @@ get_render_node_from_id_path_tag(struct udev *udev,
                (struct udev_enumerate *));
    UDEV_SYMBOL(struct udev_list_entry *, udev_enumerate_get_list_entry,
                (struct udev_enumerate *));
+   UDEV_SYMBOL(void, udev_enumerate_unref,
+               (struct udev_enumerate *));
    UDEV_SYMBOL(struct udev_list_entry *, udev_list_entry_get_next,
                (struct udev_list_entry *));
    UDEV_SYMBOL(const char *, udev_list_entry_get_name,
@@ -281,6 +309,8 @@ get_render_node_from_id_path_tag(struct udev *udev,
       udev_device_unref(device);
    }
 
+   udev_enumerate_unref(e);
+
    if (found) {
       path_res = strdup(udev_device_get_devnode(device));
       udev_device_unref(device);
@@ -314,22 +344,6 @@ get_id_path_tag_from_fd(struct udev *udev, int fd)
    return id_path_tag;
 }
 
-int
-loader_open_device(const char *device_name)
-{
-   int fd;
-#ifdef O_CLOEXEC
-   fd = open(device_name, O_RDWR | O_CLOEXEC);
-   if (fd == -1 && errno == EINVAL)
-#endif
-   {
-      fd = open(device_name, O_RDWR);
-      if (fd != -1)
-         fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
-   }
-   return fd;
-}
-
 #ifdef USE_DRICONF
 const char __driConfigOptionsLoader[] =
 DRI_CONF_BEGIN
@@ -491,7 +505,7 @@ sysfs_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id)
 }
 #endif
 
-#if !defined(__NOT_HAVE_DRM_H)
+#if defined(HAVE_LIBDRM)
 /* for i915 */
 #include <i915_drm.h>
 /* for radeon */
@@ -574,7 +588,7 @@ loader_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id)
    if (sysfs_get_pci_id_for_fd(fd, vendor_id, chip_id))
       return 1;
 #endif
-#if !defined(__NOT_HAVE_DRM_H)
+#if HAVE_LIBDRM
    if (drm_get_pci_id_for_fd(fd, vendor_id, chip_id))
       return 1;
 #endif
@@ -685,7 +699,7 @@ loader_get_driver_for_fd(int fd, unsigned driver_types)
 
    if (!loader_get_pci_id_for_fd(fd, &vendor_id, &chip_id)) {
 
-#ifndef __NOT_HAVE_DRM_H
+#if HAVE_LIBDRM
       /* fallback to drmGetVersion(): */
       drmVersionPtr version = drmGetVersion(fd);
 
diff --git a/src/loader/pci_id_driver_map.c b/src/loader/pci_id_driver_map.c
index cb6f705acbd..3c4657fd014 100644
--- a/src/loader/pci_id_driver_map.c
+++ b/src/loader/pci_id_driver_map.c
@@ -23,7 +23,7 @@
 
 int is_nouveau_vieux(int fd);
 
-#ifndef __NOT_HAVE_DRM_H
+#ifdef HAVE_LIBDRM
 
 #include <xf86drm.h>
 #include <nouveau_drm.h>
diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am
index 50c5b2ebba3..160a255af6a 100644
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 SUBDIRS =
 TESTS =
 
diff --git a/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml
new file mode 100644
index 00000000000..14e1c20b9d5
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<!-- This is included by gl_API.xml. -->
+
+<OpenGLAPI>
+
+<category name="GL_ARB_get_texture_sub_image" number="165">
+
+    <function name="GetTextureSubImage" offset="assign">
+        <param name="texture" type="GLuint"/>
+        <param name="level" type="GLint"/>
+        <param name="xoffset" type="GLint"/>
+        <param name="yoffset" type="GLint"/>
+        <param name="zoffset" type="GLint"/>
+        <param name="width" type="GLsizei"/>
+        <param name="height" type="GLsizei"/>
+        <param name="depth" type="GLsizei"/>
+        <param name="format" type="GLenum"/>
+        <param name="type" type="GLenum"/>
+        <param name="bufSize" type="GLsizei"/>
+        <param name="pixels" type="GLvoid *"/>
+    </function>
+
+    <function name="GetCompressedTextureSubImage" offset="assign">
+        <param name="texture" type="GLuint"/>
+        <param name="level" type="GLint"/>
+        <param name="xoffset" type="GLint"/>
+        <param name="yoffset" type="GLint"/>
+        <param name="zoffset" type="GLint"/>
+        <param name="width" type="GLsizei"/>
+        <param name="height" type="GLsizei"/>
+        <param name="depth" type="GLsizei"/>
+        <param name="bufSize" type="GLsizei"/>
+        <param name="pixels" type="GLvoid *"/>
+    </function>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/ARB_shader_subroutine.xml b/src/mapi/glapi/gen/ARB_shader_subroutine.xml
new file mode 100644
index 00000000000..04b75cb8f59
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_shader_subroutine.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<!-- Note: no GLX protocol info yet. -->
+
+<OpenGLAPI>
+
+<category name="GL_ARB_shader_subroutine" number="90">
+
+    <function name="GetSubroutineUniformLocation" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="name" type="const GLchar *"/>
+        <return type="GLint"/>
+    </function>
+
+    <function name="GetSubroutineIndex" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="name" type="const GLchar *"/>
+        <return type="GLuint"/>
+    </function>
+
+    <function name="GetActiveSubroutineUniformiv" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="index" type="GLuint"/>
+        <param name="pname" type="GLenum"/>
+        <param name="values" type="GLint *" output="true"/>
+    </function>
+
+    <function name="GetActiveSubroutineUniformName" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="index" type="GLuint"/>
+        <param name="bufsize" type="GLsizei"/>
+        <param name="length" type="GLsizei *" output="true"/>
+        <param name="name" type="GLchar *" output="true"/>
+    </function>
+
+    <function name="GetActiveSubroutineName" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="index" type="GLuint"/>
+        <param name="bufsize" type="GLsizei"/>
+        <param name="length" type="GLsizei *" output="true"/>
+        <param name="name" type="GLchar *" output="true"/>
+    </function>
+
+    <function name="UniformSubroutinesuiv" offset="assign">
+        <param name="shadertype" type="GLenum"/>
+        <param name="count" type="GLsizei"/>
+        <param name="indices" type="const GLuint *"/>
+    </function>
+
+    <function name="GetUniformSubroutineuiv" offset="assign">
+        <param name="shadertype" type="GLenum"/>
+        <param name="location" type="GLint"/>
+        <param name="params" type="GLuint *" output="true"/>
+    </function>
+
+    <function name="GetProgramStageiv" offset="assign">
+        <param name="program" type="GLuint"/>
+        <param name="shadertype" type="GLenum"/>
+        <param name="pname" type="GLenum"/>
+        <param name="values" type="GLint *" output="true"/>
+    </function>
+
+    <enum name="ACTIVE_SUBROUTINES" value="0x8DE5"/>
+    <enum name="ACTIVE_SUBROUTINE_UNIFORMS" value="0x8DE6"/>
+    <enum name="ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS" value="0x8E47"/>
+    <enum name="ACTIVE_SUBROUTINE_MAX_LENGTH" value="0x8E48"/>
+    <enum name="ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH" value="0x8E49"/>
+
+    <enum name="MAX_SUBROUTINES" value="0x8DE7"/>
+    <enum name="MAX_SUBROUTINE_UNIFORM_LOCATIONS" value="0x8DE8"/>
+
+    <enum name="NUM_COMPATIBLE_SUBROUTINES" value="0x8E4A"/>
+    <enum name="COMPATIBLE_SUBROUTINES" value="0x8E4B"/>
+
+    <!-- UNIFORM_SIZE, UNIFORM_NAME_LENGTH already in GL3.1 -->
+
+</category>
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/ARB_tessellation_shader.xml b/src/mapi/glapi/gen/ARB_tessellation_shader.xml
new file mode 100644
index 00000000000..16a213933ef
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_tessellation_shader.xml
@@ -0,0 +1,62 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<!-- Note: no GLX protocol info yet. -->
+
+<OpenGLAPI>
+
+
+<category name="GL_ARB_tessellation_shader" number="91">
+
+    <!--<enum value="0" name="FALSE"/>
+    <enum value="1" name="TRUE"/>
+    <enum value="0x0004" name="TRIANGLES"/>
+    <enum value="0x0007" name="QUADS"/>
+    <enum value="0x0202" name="EQUAL"/>
+    <enum value="0x0900" name="CW"/>
+    <enum value="0x0901" name="CCW"/>-->
+
+    <enum value="0x000E" name="PATCHES"/>
+    <enum value="0x84F0" name="UNIFORM_BLOCK_REFERENCED_BY_TESS_CONTROL_SHADER"/>
+    <enum value="0x84F1" name="UNIFORM_BLOCK_REFERENCED_BY_TESS_EVALUATION_SHADER"/>
+    <enum value="0x886C" name="MAX_TESS_CONTROL_INPUT_COMPONENTS"/>
+    <enum value="0x886D" name="MAX_TESS_EVALUATION_INPUT_COMPONENTS"/>
+    <enum value="0x8E1E" name="MAX_COMBINED_TESS_CONTROL_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E1F" name="MAX_COMBINED_TESS_EVALUATION_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E72" name="PATCH_VERTICES"/>
+    <enum value="0x8E73" name="PATCH_DEFAULT_INNER_LEVEL"/>
+    <enum value="0x8E74" name="PATCH_DEFAULT_OUTER_LEVEL"/>
+    <enum value="0x8E75" name="TESS_CONTROL_OUTPUT_VERTICES"/>
+    <enum value="0x8E76" name="TESS_GEN_MODE"/>
+    <enum value="0x8E77" name="TESS_GEN_SPACING"/>
+    <enum value="0x8E78" name="TESS_GEN_VERTEX_ORDER"/>
+    <enum value="0x8E79" name="TESS_GEN_POINT_MODE"/>
+    <enum value="0x8E7A" name="ISOLINES"/>
+    <enum value="0x8E7B" name="FRACTIONAL_ODD"/>
+    <enum value="0x8E7C" name="FRACTIONAL_EVEN"/>
+    <enum value="0x8E7D" name="MAX_PATCH_VERTICES"/>
+    <enum value="0x8E7E" name="MAX_TESS_GEN_LEVEL"/>
+    <enum value="0x8E7F" name="MAX_TESS_CONTROL_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E80" name="MAX_TESS_EVALUATION_UNIFORM_COMPONENTS"/>
+    <enum value="0x8E81" name="MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS"/>
+    <enum value="0x8E82" name="MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS"/>
+    <enum value="0x8E83" name="MAX_TESS_CONTROL_OUTPUT_COMPONENTS"/>
+    <enum value="0x8E84" name="MAX_TESS_PATCH_COMPONENTS"/>
+    <enum value="0x8E85" name="MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS"/>
+    <enum value="0x8E86" name="MAX_TESS_EVALUATION_OUTPUT_COMPONENTS"/>
+    <enum value="0x8E87" name="TESS_EVALUATION_SHADER"/>
+    <enum value="0x8E88" name="TESS_CONTROL_SHADER"/>
+    <enum value="0x8E89" name="MAX_TESS_CONTROL_UNIFORM_BLOCKS"/>
+    <enum value="0x8E8A" name="MAX_TESS_EVALUATION_UNIFORM_BLOCKS"/>
+
+    <function name="PatchParameteri" offset="assign">
+        <param name="pname" type="GLenum"/>
+        <param name="value" type="GLint"/>
+    </function>
+    <function name="PatchParameterfv" offset="assign">
+        <param name="pname" type="GLenum"/>
+        <param name="values" type="const GLfloat *"/>
+    </function>
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 5b163b02e00..7d9d1a61215 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -135,6 +135,7 @@ API_XML = \
 	ARB_framebuffer_object.xml \
 	ARB_geometry_shader4.xml \
 	ARB_get_program_binary.xml \
+	ARB_get_texture_sub_image.xml \
 	ARB_gpu_shader_fp64.xml \
 	ARB_gpu_shader5.xml \
 	ARB_instanced_arrays.xml \
@@ -151,6 +152,7 @@ API_XML = \
 	ARB_separate_shader_objects.xml \
 	ARB_shader_atomic_counters.xml \
 	ARB_shader_image_load_store.xml \
+	ARB_shader_subroutine.xml \
 	ARB_sync.xml \
 	ARB_texture_barrier.xml \
 	ARB_texture_buffer_object.xml \
diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index b623b44beeb..3a0eb1869a8 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -74,6 +74,17 @@ functions = {
     # GL_ARB_geometry_shader4, so OpenGL 3.2 is required.
     "FramebufferTexture": exec_info(core=32),
 
+    # OpenGL 4.0 / GL_ARB_shader_subroutine. Mesa only exposes this
+    # extension with core profile.
+    "GetSubroutineUniformLocation": exec_info(core=31),
+    "GetSubroutineIndex": exec_info(core=31),
+    "GetActiveSubroutineUniformiv": exec_info(core=31),
+    "GetActiveSubroutineUniformName": exec_info(core=31),
+    "GetActiveSubroutineName": exec_info(core=31),
+    "UniformSubroutinesuiv": exec_info(core=31),
+    "GetUniformSubroutineuiv": exec_info(core=31),
+    "GetProgramStageiv": exec_info(core=31),
+
     # OpenGL 4.0 / GL_ARB_gpu_shader_fp64.  The extension spec says:
     #
     #     "OpenGL 3.2 and GLSL 1.50 are required."
diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index da468dc5876..5d95f278a91 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -549,9 +549,9 @@ def main():
     """Main function."""
     args = _parser()
 
-    if args._mode == "dispatch_c":
+    if args.mode == "dispatch_c":
         printer = PrintGlxDispatchFunctions(args.swap)
-    elif args._mode == "dispatch_h":
+    elif args.mode == "dispatch_h":
         printer = PrintGlxDispatch_h()
 
     api = gl_XML.parse_GL_API(
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 2f330756f22..658efa485f6 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8072,7 +8072,13 @@
 
 <xi:include href="ARB_vertex_type_2_10_10_10_rev.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions #86...#93 -->
+<!-- ARB extensions #86...#89 -->
+
+<xi:include href="ARB_shader_subroutine.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<xi:include href="ARB_tessellation_shader.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extensions #92...#93 -->
 
 <xi:include href="ARB_draw_indirect.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
@@ -8253,7 +8259,9 @@
 
 <xi:include href="ARB_direct_state_access.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extensions 165 - 166 -->
+<xi:include href="ARB_get_texture_sub_image.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extension 166 -->
 
 <xi:include href="ARB_texture_barrier.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
diff --git a/src/mapi/glapi/gen/gl_enums.py b/src/mapi/glapi/gen/gl_enums.py
index 955f27d0818..041c2f8ddb8 100644
--- a/src/mapi/glapi/gen/gl_enums.py
+++ b/src/mapi/glapi/gen/gl_enums.py
@@ -78,7 +78,7 @@ static int compar_nr( const int *a, enum_elt *b )
 
 static char token_tmp[20];
 
-const char *_mesa_lookup_enum_by_nr( int nr )
+const char *_mesa_enum_to_string( int nr )
 {
    enum_elt *elt;
 
@@ -118,6 +118,7 @@ static const char *prim_names[PRIM_MAX+3] = {
    "GL_LINE_STRIP_ADJACENCY",
    "GL_TRIANGLES_ADJACENCY",
    "GL_TRIANGLE_STRIP_ADJACENCY",
+   "GL_PATCHES",
    "outside begin/end",
    "unknown state"
 };
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 4ba5b2fac29..eb4a3da3c84 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -19,8 +19,6 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
 SUBDIRS = . main/tests
 
 if HAVE_X11_DRIVER
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 83f500fbf20..ed9848c5454 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -407,6 +407,7 @@ STATETRACKER_FILES = \
 	state_tracker/st_atom_shader.c \
 	state_tracker/st_atom_shader.h \
 	state_tracker/st_atom_stipple.c \
+	state_tracker/st_atom_tess.c \
 	state_tracker/st_atom_texture.c \
 	state_tracker/st_atom_viewport.c \
 	state_tracker/st_cache.h \
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 71c1a763912..6fe42b1775c 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -94,14 +94,14 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->QuerySamplesForFormat = _mesa_query_samples_for_format;
    driver->TexImage = _mesa_store_teximage;
    driver->TexSubImage = _mesa_store_texsubimage;
-   driver->GetTexImage = _mesa_meta_GetTexImage;
+   driver->GetTexSubImage = _mesa_meta_GetTexSubImage;
    driver->ClearTexSubImage = _mesa_meta_ClearTexSubImage;
    driver->CopyTexSubImage = _mesa_meta_CopyTexSubImage;
    driver->GenerateMipmap = _mesa_meta_GenerateMipmap;
    driver->TestProxyTexImage = _mesa_test_proxy_teximage;
    driver->CompressedTexImage = _mesa_store_compressed_teximage;
    driver->CompressedTexSubImage = _mesa_store_compressed_texsubimage;
-   driver->GetCompressedTexImage = _mesa_GetCompressedTexImage_sw;
+   driver->GetCompressedTexSubImage = _mesa_GetCompressedTexSubImage_sw;
    driver->BindTexture = NULL;
    driver->NewTextureObject = _mesa_new_texture_object;
    driver->DeleteTexture = _mesa_delete_texture_object;
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 214a68a9129..bde544ef490 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -728,7 +728,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
       save->DepthNear = ctx->ViewportArray[0].Near;
       save->DepthFar = ctx->ViewportArray[0].Far;
       /* set depth range to default */
-      _mesa_DepthRange(0.0, 1.0);
+      _mesa_set_depth_range(ctx, 0, 0.0, 1.0);
    }
 
    if (state & MESA_META_CLAMP_FRAGMENT_COLOR) {
@@ -945,6 +945,8 @@ _mesa_meta_end(struct gl_context *ctx)
    if (state & MESA_META_SHADER) {
       static const GLenum targets[] = {
          GL_VERTEX_SHADER,
+         GL_TESS_CONTROL_SHADER,
+         GL_TESS_EVALUATION_SHADER,
          GL_GEOMETRY_SHADER,
          GL_FRAGMENT_SHADER,
       };
@@ -1129,7 +1131,7 @@ _mesa_meta_end(struct gl_context *ctx)
          _mesa_set_viewport(ctx, 0, save->ViewportX, save->ViewportY,
                             save->ViewportW, save->ViewportH);
       }
-      _mesa_DepthRange(save->DepthNear, save->DepthFar);
+      _mesa_set_depth_range(ctx, 0, save->DepthNear, save->DepthFar);
    }
 
    if (state & MESA_META_CLAMP_FRAGMENT_COLOR &&
@@ -2449,30 +2451,53 @@ _mesa_meta_Bitmap(struct gl_context *ctx,
 
 /**
  * Compute the texture coordinates for the four vertices of a quad for
- * drawing a 2D texture image or slice of a cube/3D texture.
+ * drawing a 2D texture image or slice of a cube/3D texture.  The offset
+ * and width, height specify a sub-region of the 2D image.
+ *
  * \param faceTarget  GL_TEXTURE_1D/2D/3D or cube face name
  * \param slice  slice of a 1D/2D array texture or 3D texture
- * \param width  width of the texture image
- * \param height  height of the texture image
+ * \param xoffset  X position of sub texture
+ * \param yoffset  Y position of sub texture
+ * \param width  width of the sub texture image
+ * \param height  height of the sub texture image
+ * \param total_width  total width of the texture image
+ * \param total_height  total height of the texture image
+ * \param total_depth  total depth of the texture image
  * \param coords0/1/2/3  returns the computed texcoords
  */
 void
 _mesa_meta_setup_texture_coords(GLenum faceTarget,
                                 GLint slice,
+                                GLint xoffset,
+                                GLint yoffset,
                                 GLint width,
                                 GLint height,
-                                GLint depth,
+                                GLint total_width,
+                                GLint total_height,
+                                GLint total_depth,
                                 GLfloat coords0[4],
                                 GLfloat coords1[4],
                                 GLfloat coords2[4],
                                 GLfloat coords3[4])
 {
-   static const GLfloat st[4][2] = {
-      {0.0f, 0.0f}, {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f}
-   };
+   float st[4][2];
    GLuint i;
+   const float s0 = (float) xoffset / (float) total_width;
+   const float s1 = (float) (xoffset + width) / (float) total_width;
+   const float t0 = (float) yoffset / (float) total_height;
+   const float t1 = (float) (yoffset + height) / (float) total_height;
    GLfloat r;
 
+   /* setup the reference texcoords */
+   st[0][0] = s0;
+   st[0][1] = t0;
+   st[1][0] = s1;
+   st[1][1] = t0;
+   st[2][0] = s1;
+   st[2][1] = t1;
+   st[3][0] = s0;
+   st[3][1] = t1;
+
    if (faceTarget == GL_TEXTURE_CUBE_MAP_ARRAY)
       faceTarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + slice % 6;
 
@@ -2489,52 +2514,52 @@ _mesa_meta_setup_texture_coords(GLenum faceTarget,
    case GL_TEXTURE_3D:
    case GL_TEXTURE_2D_ARRAY:
       if (faceTarget == GL_TEXTURE_3D) {
-         assert(slice < depth);
-         assert(depth >= 1);
-         r = (slice + 0.5f) / depth;
+         assert(slice < total_depth);
+         assert(total_depth >= 1);
+         r = (slice + 0.5f) / total_depth;
       }
       else if (faceTarget == GL_TEXTURE_2D_ARRAY)
          r = (float) slice;
       else
          r = 0.0F;
-      coords0[0] = 0.0F; /* s */
-      coords0[1] = 0.0F; /* t */
+      coords0[0] = st[0][0]; /* s */
+      coords0[1] = st[0][1]; /* t */
       coords0[2] = r; /* r */
-      coords1[0] = 1.0F;
-      coords1[1] = 0.0F;
+      coords1[0] = st[1][0];
+      coords1[1] = st[1][1];
       coords1[2] = r;
-      coords2[0] = 1.0F;
-      coords2[1] = 1.0F;
+      coords2[0] = st[2][0];
+      coords2[1] = st[2][1];
       coords2[2] = r;
-      coords3[0] = 0.0F;
-      coords3[1] = 1.0F;
+      coords3[0] = st[3][0];
+      coords3[1] = st[3][1];
       coords3[2] = r;
       break;
    case GL_TEXTURE_RECTANGLE_ARB:
-      coords0[0] = 0.0F; /* s */
-      coords0[1] = 0.0F; /* t */
+      coords0[0] = (float) xoffset; /* s */
+      coords0[1] = (float) yoffset; /* t */
       coords0[2] = 0.0F; /* r */
-      coords1[0] = (float) width;
-      coords1[1] = 0.0F;
+      coords1[0] = (float) (xoffset + width);
+      coords1[1] = (float) yoffset;
       coords1[2] = 0.0F;
-      coords2[0] = (float) width;
-      coords2[1] = (float) height;
+      coords2[0] = (float) (xoffset + width);
+      coords2[1] = (float) (yoffset + height);
       coords2[2] = 0.0F;
-      coords3[0] = 0.0F;
-      coords3[1] = (float) height;
+      coords3[0] = (float) xoffset;
+      coords3[1] = (float) (yoffset + height);
       coords3[2] = 0.0F;
       break;
    case GL_TEXTURE_1D_ARRAY:
-      coords0[0] = 0.0F; /* s */
+      coords0[0] = st[0][0]; /* s */
       coords0[1] = (float) slice; /* t */
       coords0[2] = 0.0F; /* r */
-      coords1[0] = 1.0f;
+      coords1[0] = st[1][0];
       coords1[1] = (float) slice;
       coords1[2] = 0.0F;
-      coords2[0] = 1.0F;
+      coords2[0] = st[2][0];
       coords2[1] = (float) slice;
       coords2[2] = 0.0F;
-      coords3[0] = 0.0F;
+      coords3[0] = st[3][0];
       coords3[1] = (float) slice;
       coords3[2] = 0.0F;
       break;
@@ -2943,15 +2968,14 @@ static bool
 decompress_texture_image(struct gl_context *ctx,
                          struct gl_texture_image *texImage,
                          GLuint slice,
+                         GLint xoffset, GLint yoffset,
+                         GLsizei width, GLsizei height,
                          GLenum destFormat, GLenum destType,
                          GLvoid *dest)
 {
    struct decompress_state *decompress = &ctx->Meta->Decompress;
    struct decompress_fbo_state *decompress_fbo;
    struct gl_texture_object *texObj = texImage->TexObject;
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Height;
    const GLenum target = texObj->Target;
    GLenum rbFormat;
    GLenum faceTarget;
@@ -3069,7 +3093,10 @@ decompress_texture_image(struct gl_context *ctx,
    /* Silence valgrind warnings about reading uninitialized stack. */
    memset(verts, 0, sizeof(verts));
 
-   _mesa_meta_setup_texture_coords(faceTarget, slice, width, height, depth,
+   _mesa_meta_setup_texture_coords(faceTarget, slice,
+                                   xoffset, yoffset, width, height,
+                                   texImage->Width, texImage->Height,
+                                   texImage->Depth,
                                    verts[0].tex,
                                    verts[1].tex,
                                    verts[2].tex,
@@ -3123,7 +3150,7 @@ decompress_texture_image(struct gl_context *ctx,
    /* read pixels from renderbuffer */
    {
       GLenum baseTexFormat = texImage->_BaseFormat;
-      GLenum destBaseFormat = _mesa_base_tex_format(ctx, destFormat);
+      GLenum destBaseFormat = _mesa_unpack_format_to_base_format(destFormat);
 
       /* The pixel transfer state will be set to default values at this point
        * (see MESA_META_PIXEL_TRANSFER) so pixel transfer ops are effectively
@@ -3132,19 +3159,13 @@ decompress_texture_image(struct gl_context *ctx,
        * returned as red and two-channel texture values are returned as
        * red/alpha.
        */
-      if ((baseTexFormat == GL_LUMINANCE ||
-           baseTexFormat == GL_LUMINANCE_ALPHA ||
-           baseTexFormat == GL_INTENSITY) ||
+      if (_mesa_need_luminance_to_rgb_conversion(baseTexFormat,
+                                                 destBaseFormat) ||
           /* If we're reading back an RGB(A) texture (using glGetTexImage) as
 	   * luminance then we need to return L=tex(R).
 	   */
-          ((baseTexFormat == GL_RGBA ||
-            baseTexFormat == GL_RGB  ||
-            baseTexFormat == GL_RG) &&
-          (destBaseFormat == GL_LUMINANCE ||
-           destBaseFormat == GL_LUMINANCE_ALPHA ||
-           destBaseFormat == GL_LUMINANCE_INTEGER_EXT ||
-           destBaseFormat == GL_LUMINANCE_ALPHA_INTEGER_EXT))) {
+          _mesa_need_rgb_to_luminance_conversion(baseTexFormat,
+                                                 destBaseFormat)) {
          /* Green and blue must be zero */
          _mesa_PixelTransferf(GL_GREEN_SCALE, 0.0f);
          _mesa_PixelTransferf(GL_BLUE_SCALE, 0.0f);
@@ -3171,15 +3192,17 @@ decompress_texture_image(struct gl_context *ctx,
  * from core Mesa.
  */
 void
-_mesa_meta_GetTexImage(struct gl_context *ctx,
-                       GLenum format, GLenum type, GLvoid *pixels,
-                       struct gl_texture_image *texImage)
+_mesa_meta_GetTexSubImage(struct gl_context *ctx,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLenum format, GLenum type, GLvoid *pixels,
+                          struct gl_texture_image *texImage)
 {
    if (_mesa_is_format_compressed(texImage->TexFormat)) {
       GLuint slice;
       bool result = true;
 
-      for (slice = 0; slice < texImage->Depth; slice++) {
+      for (slice = 0; slice < depth; slice++) {
          void *dst;
          if (texImage->TexObject->Target == GL_TEXTURE_2D_ARRAY
              || texImage->TexObject->Target == GL_TEXTURE_CUBE_MAP_ARRAY) {
@@ -3191,14 +3214,14 @@ _mesa_meta_GetTexImage(struct gl_context *ctx,
             struct gl_pixelstore_attrib packing = ctx->Pack;
             packing.SkipPixels = 0;
             packing.SkipRows = 0;
-            dst = _mesa_image_address3d(&packing, pixels, texImage->Width,
-                                        texImage->Height, format, type,
-                                        slice, 0, 0);
+            dst = _mesa_image_address3d(&packing, pixels, width, height,
+                                        format, type, slice, 0, 0);
          }
          else {
             dst = pixels;
          }
          result = decompress_texture_image(ctx, texImage, slice,
+                                           xoffset, yoffset, width, height,
                                            format, type, dst);
          if (!result)
             break;
@@ -3208,7 +3231,8 @@ _mesa_meta_GetTexImage(struct gl_context *ctx,
          return;
    }
 
-   _mesa_GetTexImage_sw(ctx, format, type, pixels, texImage);
+   _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset,
+                           width, height, depth, format, type, pixels, texImage);
 }
 
 
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
index e7d894df1d7..fe439153aa0 100644
--- a/src/mesa/drivers/common/meta.h
+++ b/src/mesa/drivers/common/meta.h
@@ -560,9 +560,11 @@ _mesa_meta_ClearTexSubImage(struct gl_context *ctx,
                             const GLvoid *clearValue);
 
 extern void
-_mesa_meta_GetTexImage(struct gl_context *ctx,
-                       GLenum format, GLenum type, GLvoid *pixels,
-                       struct gl_texture_image *texImage);
+_mesa_meta_GetTexSubImage(struct gl_context *ctx,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLenum format, GLenum type, GLvoid *pixels,
+                          struct gl_texture_image *texImage);
 
 extern void
 _mesa_meta_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
@@ -594,9 +596,13 @@ _mesa_meta_alloc_texture(struct temp_texture *tex,
 void
 _mesa_meta_setup_texture_coords(GLenum faceTarget,
                                 GLint slice,
+                                GLint xoffset,
+                                GLint yoffset,
                                 GLint width,
                                 GLint height,
-                                GLint depth,
+                                GLint total_width,
+                                GLint total_height,
+                                GLint total_depth,
                                 GLfloat coords0[4],
                                 GLfloat coords1[4],
                                 GLfloat coords2[4],
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 9cace2b245a..71d18de87db 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -82,7 +82,7 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
    y_scale = samples * 0.5;
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && is_power_of_two(samples));
+   assert(samples > 0 && _mesa_is_pow_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -263,7 +263,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
    }
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && is_power_of_two(samples));
+   assert(samples > 0 && _mesa_is_pow_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -312,7 +312,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
       break;
    default:
       _mesa_problem(ctx, "Unkown texture target %s\n",
-                    _mesa_lookup_enum_by_nr(target));
+                    _mesa_enum_to_string(target));
       shader_index = BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_RESOLVE;
    }
 
@@ -434,7 +434,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
           * (so the floating point exponent just gets increased), rather than
           * doing a naive sum and dividing.
           */
-         assert(is_power_of_two(samples));
+         assert(_mesa_is_pow_two(samples));
          /* Fetch each individual sample. */
          sample_resolve = rzalloc_size(mem_ctx, 1);
          for (i = 0; i < samples; i++) {
diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c
index 1729766f78d..149ed18503c 100644
--- a/src/mesa/drivers/common/meta_copy_image.c
+++ b/src/mesa/drivers/common/meta_copy_image.c
@@ -138,8 +138,8 @@ _mesa_meta_CopyImageSubData_uncompressed(struct gl_context *ctx,
          goto cleanup;
    }
 
-   /* We really only need to stash the bound framebuffers. */
-   _mesa_meta_begin(ctx, 0);
+   /* We really only need to stash the bound framebuffers and scissor. */
+   _mesa_meta_begin(ctx, MESA_META_SCISSOR);
 
    _mesa_GenFramebuffers(2, fbos);
    _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, fbos[0]);
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index c1b6d3c1f86..0655f052219 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -66,7 +66,7 @@ fallback_required(struct gl_context *ctx, GLenum target,
    if (target == GL_TEXTURE_3D) {
       _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH,
                        "glGenerateMipmap() to %s target\n",
-                       _mesa_lookup_enum_by_nr(target));
+                       _mesa_enum_to_string(target));
       return true;
    }
 
@@ -317,7 +317,9 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
          /* Setup texture coordinates */
          _mesa_meta_setup_texture_coords(faceTarget,
                                          layer,
-                                         0, 0, 1, /* width, height never used here */
+                                         0, 0, /* xoffset, yoffset */
+                                         srcWidth, srcHeight, /* img size */
+                                         srcWidth, srcHeight, srcDepth,
                                          verts[0].tex,
                                          verts[1].tex,
                                          verts[2].tex,
diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index d2474f52718..16d8f5d4747 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -25,8 +25,10 @@
  *    Jason Ekstrand <jason.ekstrand@intel.com>
  */
 
+#include "blend.h"
 #include "bufferobj.h"
 #include "buffers.h"
+#include "clear.h"
 #include "fbobject.h"
 #include "glformats.h"
 #include "glheader.h"
@@ -248,6 +250,24 @@ fail:
    return success;
 }
 
+static bool
+need_signed_unsigned_int_conversion(mesa_format rbFormat,
+                                    GLenum format, GLenum type)
+{
+   const GLenum srcType = _mesa_get_format_datatype(rbFormat);
+   const bool is_dst_format_integer = _mesa_is_enum_format_integer(format);
+   return (srcType == GL_INT &&   /* signed source, unsigned dest type */
+           is_dst_format_integer &&
+           (type == GL_UNSIGNED_INT ||
+            type == GL_UNSIGNED_SHORT ||
+            type == GL_UNSIGNED_BYTE)) ||
+          (srcType == GL_UNSIGNED_INT &&   /* unsigned source, signed dest */
+           is_dst_format_integer &&
+           (type == GL_INT ||
+            type == GL_SHORT ||
+            type == GL_BYTE));
+}
+
 bool
 _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
                               struct gl_texture_image *tex_image,
@@ -260,8 +280,10 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    int full_height, image_height;
    struct gl_texture_image *pbo_tex_image;
    struct gl_renderbuffer *rb = NULL;
-   GLenum status;
-   bool success = false;
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
+   GLenum status, src_base_format;
+   bool success = false, clear_channels_to_zero = false;
+   float save_clear_color[4];
    int z;
 
    if (!_mesa_is_bufferobj(packing->BufferObj))
@@ -273,13 +295,27 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
        format == GL_COLOR_INDEX)
       return false;
 
-   if (ctx->_ImageTransferState)
-      return false;
-
-
+   /* Don't use the meta path for ReadPixels under the conditions below. */
    if (!tex_image) {
       rb = ctx->ReadBuffer->_ColorReadBuffer;
-      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format))
+
+      /* _mesa_get_readpixels_transfer_ops() includes the cases of read
+       * color clamping along with the ctx->_ImageTransferState.
+       */
+      if (_mesa_get_readpixels_transfer_ops(ctx, rb->Format, format,
+                                            type, GL_FALSE))
+         return false;
+
+      if (_mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat,
+                                                 dstBaseFormat))
+         return false;
+
+      /* This function relies on BlitFramebuffer to fill in the pixel data for
+       * ReadPixels. But BlitFramebuffer doesn't support signed to unsigned
+       * or unsigned to signed integer conversions. OpenGL spec expects an
+       * invalid operation in that case.
+       */
+      if (need_signed_unsigned_int_conversion(rb->Format, format, type))
          return false;
    }
 
@@ -300,6 +336,10 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER |
                            MESA_META_PIXEL_STORE));
 
+   /* GL_CLAMP_FRAGMENT_COLOR doesn't affect ReadPixels and GetTexImage */
+   if (ctx->Extensions.ARB_color_buffer_float)
+      _mesa_ClampColor(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE);
+
    _mesa_GenFramebuffers(2, fbos);
 
    if (tex_image && tex_image->TexObject->Target == GL_TEXTURE_1D_ARRAY) {
@@ -345,6 +385,27 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
                                   GL_COLOR_BUFFER_BIT, GL_NEAREST))
       goto fail;
 
+   src_base_format = tex_image ?
+                     tex_image->_BaseFormat :
+                     ctx->ReadBuffer->_ColorReadBuffer->_BaseFormat;
+
+   /* Depending on the base formats involved we might need to rebase some
+    * values. For example if we download from a Luminance format to RGBA
+    * format, we want G=0 and B=0.
+    */
+   clear_channels_to_zero =
+      _mesa_need_luminance_to_rgb_conversion(src_base_format,
+                                             pbo_tex_image->_BaseFormat);
+
+   if (clear_channels_to_zero) {
+      memcpy(save_clear_color, ctx->Color.ClearColor.f, 4 * sizeof(float));
+      /* Clear the Green, Blue channels. */
+      _mesa_ColorMask(GL_FALSE, GL_TRUE, GL_TRUE,
+                      src_base_format != GL_LUMINANCE_ALPHA);
+      _mesa_ClearColor(0.0, 0.0, 0.0, 1.0);
+      _mesa_Clear(GL_COLOR_BUFFER_BIT);
+   }
+
    for (z = 1; z < depth; z++) {
       _mesa_meta_bind_fbo_image(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
                                 tex_image, zoffset + z);
@@ -357,6 +418,15 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
                                  0, z * image_height,
                                  width, z * image_height + height,
                                  GL_COLOR_BUFFER_BIT, GL_NEAREST);
+      if (clear_channels_to_zero)
+         _mesa_Clear(GL_COLOR_BUFFER_BIT);
+   }
+
+   /* Unmask the color channels and restore the saved clear color values. */
+   if (clear_channels_to_zero) {
+      _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+      _mesa_ClearColor(save_clear_color[0], save_clear_color[1],
+                       save_clear_color[2], save_clear_color[3]);
    }
 
    success = true;
diff --git a/src/mesa/drivers/dri/common/Android.mk b/src/mesa/drivers/dri/common/Android.mk
index 6986f5e8cb4..f1a733011b9 100644
--- a/src/mesa/drivers/dri/common/Android.mk
+++ b/src/mesa/drivers/dri/common/Android.mk
@@ -43,13 +43,6 @@ LOCAL_EXPORT_C_INCLUDE_DIRS := \
     $(LOCAL_PATH) \
     $(intermediates)
 
-# swrast only
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS := -D__NOT_HAVE_DRM_H
-else
-LOCAL_SHARED_LIBRARIES := libdrm
-endif
-
 LOCAL_SRC_FILES := \
 	$(DRI_COMMON_FILES) \
 	$(XMLCONFIG_FILES)
@@ -110,13 +103,6 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES
 LOCAL_C_INCLUDES := \
     $(MESA_DRI_C_INCLUDES)
 
-# swrast only
-ifeq ($(MESA_GPU_DRIVERS),swrast)
-LOCAL_CFLAGS := -D__NOT_HAVE_DRM_H
-else
-LOCAL_SHARED_LIBRARIES := libdrm
-endif
-
 LOCAL_SRC_FILES := $(megadriver_stub_FILES)
 
 include $(MESA_COMMON_MK)
diff --git a/src/mesa/drivers/dri/common/Makefile.am b/src/mesa/drivers/dri/common/Makefile.am
index ae19fcb3565..b307f10f56b 100644
--- a/src/mesa/drivers/dri/common/Makefile.am
+++ b/src/mesa/drivers/dri/common/Makefile.am
@@ -32,6 +32,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
+	$(LIBDRM_CFLAGS) \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS)
 
@@ -53,10 +54,3 @@ libdri_test_stubs_la_CFLAGS = $(AM_CFLAGS) -DNO_MAIN
 libmegadriver_stub_la_SOURCES = $(megadriver_stub_FILES)
 
 sysconf_DATA = drirc
-
-if DRICOMMON_NEED_LIBDRM
-AM_CFLAGS += $(LIBDRM_CFLAGS)
-libdricommon_la_LIBADD = $(LIBDRM_LIBS)
-else
-AM_CFLAGS += -D__NOT_HAVE_DRM_H
-endif
diff --git a/src/mesa/drivers/dri/common/SConscript b/src/mesa/drivers/dri/common/SConscript
index b402736db69..52d201f8913 100644
--- a/src/mesa/drivers/dri/common/SConscript
+++ b/src/mesa/drivers/dri/common/SConscript
@@ -32,11 +32,6 @@ drienv.AppendUnique(LIBS = [
     'expat',
 ])
 
-# if HAVE_DRI2
-drienv.PkgUseModules('DRM')
-# else
-#env.Append(CPPDEFINES = ['__NOT_HAVE_DRM_H'])
-
 sources = drienv.ParseSourceList('Makefile.sources', ['DRI_COMMON_FILES', 'XMLCONFIG_FILES' ])
 
 dri_common = drienv.ConvenienceLibrary(
@@ -57,7 +52,6 @@ env.Append(CPPPATH = [
 ])
 
 env.Append(CPPDEFINES = [
-    '__NOT_HAVE_DRM_H',
     'HAVE_DLADDR',
 ])
 
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index e7ababe0b67..d35ac263a45 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -40,13 +40,9 @@
 
 
 #include <stdbool.h>
-#ifndef __NOT_HAVE_DRM_H
-#include <xf86drm.h>
-#endif
 #include "dri_util.h"
 #include "utils.h"
 #include "xmlpool.h"
-#include "../glsl/glsl_parser_extras.h"
 #include "main/mtypes.h"
 #include "main/version.h"
 #include "main/errors.h"
@@ -138,18 +134,6 @@ driCreateNewScreen2(int scrn, int fd,
 
     setupLoaderExtensions(psp, extensions);
 
-#ifndef __NOT_HAVE_DRM_H
-    if (fd != -1) {
-       drmVersionPtr version = drmGetVersion(fd);
-       if (version) {
-          psp->drm_version.major = version->version_major;
-          psp->drm_version.minor = version->version_minor;
-          psp->drm_version.patch = version->version_patchlevel;
-          drmFreeVersion(version);
-       }
-    }
-#endif
-
     psp->loaderPrivate = data;
 
     psp->extensions = emptyExtensionList;
@@ -179,7 +163,9 @@ driCreateNewScreen2(int scrn, int fd,
        }
     }
 
-    psp->api_mask = (1 << __DRI_API_OPENGL);
+    psp->api_mask = 0;
+    if (psp->max_gl_compat_version > 0)
+       psp->api_mask |= (1 << __DRI_API_OPENGL);
     if (psp->max_gl_core_version > 0)
        psp->api_mask |= (1 << __DRI_API_OPENGL_CORE);
     if (psp->max_gl_es1_version > 0)
@@ -238,8 +224,6 @@ static void driDestroyScreen(__DRIscreen *psp)
 	 * stream open to the X-server anymore.
 	 */
 
-       _mesa_destroy_shader_compiler();
-
 	psp->driver->DestroyScreen(psp);
 
 	driDestroyOptionCache(&psp->optionCache);
diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h
index 1138bf106de..6987f555e66 100644
--- a/src/mesa/drivers/dri/common/dri_util.h
+++ b/src/mesa/drivers/dri/common/dri_util.h
@@ -148,11 +148,6 @@ struct __DRIscreenRec {
      */
     int fd;
 
-    /**
-     * DRM (kernel module) version information.
-     */
-    __DRIversion drm_version;
-
     /**
      * Device-dependent private information (not stored in the SAREA).
      * 
diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc
index 145e707a64c..97d961b6597 100644
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -4,24 +4,15 @@
 Application bugs worked around in this file:
 ============================================
 
+* Unigine Heaven 3.0 and older contain too many bugs and can't be supported
+  by drivers that want to be compliant.
+
 * Various Unigine products don't use the #version and #extension GLSL
   directives, meaning they only get GLSL 1.10 and no extensions for their
   shaders.
   Enabling all extensions for Unigine fixes most issues, but the GLSL version
   is still 1.10.
 
-* Unigine Heaven 3.0 with ARB_texture_multisample uses a "ivec4 * vec4"
-  expression, which is illegal in GLSL 1.10.
-  Adding "#version 130" fixes this.
-
-* Unigine Heaven 3.0 with ARB_shader_bit_encoding uses the uint keyword, which
-  is illegal in GLSL 1.10.
-  Adding "#version 130" fixes this.
-
-* Unigine Heaven 3.0 with ARB_shader_bit_encoding uses a "uint & int"
-  expression, which is illegal in any GLSL version.
-  Disabling ARB_shader_bit_encoding fixes this.
-
 * If ARB_sample_shading is supported, Unigine Heaven 4.0 and Valley 1.0 uses
   an #extension directive in the middle of its shaders, which is illegal
   in GLSL.
@@ -45,18 +36,10 @@ TODO: document the other workarounds.
 	</application>
 
         <application name="Unigine Heaven (32-bit)" executable="heaven_x86">
-            <option name="force_glsl_extensions_warn" value="true" />
-            <option name="disable_blend_func_extended" value="true" />
-            <option name="force_glsl_version" value="130" />
-            <option name="disable_shader_bit_encoding" value="true" />
             <option name="allow_glsl_extension_directive_midshader" value="true" />
 	</application>
 
         <application name="Unigine Heaven (64-bit)" executable="heaven_x64">
-            <option name="force_glsl_extensions_warn" value="true" />
-            <option name="disable_blend_func_extended" value="true" />
-            <option name="force_glsl_version" value="130" />
-            <option name="disable_shader_bit_encoding" value="true" />
             <option name="allow_glsl_extension_directive_midshader" value="true" />
 	</application>
 
diff --git a/src/mesa/drivers/dri/common/utils.c b/src/mesa/drivers/dri/common/utils.c
index 70d34e8ce55..b51b263fe46 100644
--- a/src/mesa/drivers/dri/common/utils.c
+++ b/src/mesa/drivers/dri/common/utils.c
@@ -213,6 +213,7 @@ driCreateConfigs(mesa_format format,
       masks = masks_table[0];
       break;
    case MESA_FORMAT_B8G8R8X8_UNORM:
+   case MESA_FORMAT_B8G8R8X8_SRGB:
       masks = masks_table[1];
       break;
    case MESA_FORMAT_B8G8R8A8_UNORM:
diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c
index ea54e2b25b1..906e942b020 100644
--- a/src/mesa/drivers/dri/i915/i830_state.c
+++ b/src/mesa/drivers/dri/i915/i830_state.c
@@ -57,7 +57,7 @@ i830StencilFuncSeparate(struct gl_context * ctx, GLenum face, GLenum func, GLint
    mask = mask & 0xff;
 
    DBG("%s : func: %s, ref : 0x%x, mask: 0x%x\n", __func__,
-       _mesa_lookup_enum_by_nr(func), ref, mask);
+       _mesa_enum_to_string(func), ref, mask);
 
 
    I830_STATECHANGE(i830, I830_UPLOAD_CTX);
@@ -95,9 +95,9 @@ i830StencilOpSeparate(struct gl_context * ctx, GLenum face, GLenum fail, GLenum
    int fop, dfop, dpop;
 
    DBG("%s: fail : %s, zfail: %s, zpass : %s\n", __func__,
-       _mesa_lookup_enum_by_nr(fail),
-       _mesa_lookup_enum_by_nr(zfail), 
-       _mesa_lookup_enum_by_nr(zpass));
+       _mesa_enum_to_string(fail),
+       _mesa_enum_to_string(zfail), 
+       _mesa_enum_to_string(zpass));
 
    fop = 0;
    dfop = 0;
@@ -389,8 +389,8 @@ static void
 i830BlendEquationSeparate(struct gl_context * ctx, GLenum modeRGB, GLenum modeA)
 {
    DBG("%s -> %s, %s\n", __func__,
-       _mesa_lookup_enum_by_nr(modeRGB),
-       _mesa_lookup_enum_by_nr(modeA));
+       _mesa_enum_to_string(modeRGB),
+       _mesa_enum_to_string(modeA));
 
    (void) modeRGB;
    (void) modeA;
@@ -403,10 +403,10 @@ i830BlendFuncSeparate(struct gl_context * ctx, GLenum sfactorRGB,
                       GLenum dfactorRGB, GLenum sfactorA, GLenum dfactorA)
 {
    DBG("%s -> RGB(%s, %s) A(%s, %s)\n", __func__,
-       _mesa_lookup_enum_by_nr(sfactorRGB),
-       _mesa_lookup_enum_by_nr(dfactorRGB),
-       _mesa_lookup_enum_by_nr(sfactorA),
-       _mesa_lookup_enum_by_nr(dfactorA));
+       _mesa_enum_to_string(sfactorRGB),
+       _mesa_enum_to_string(dfactorRGB),
+       _mesa_enum_to_string(sfactorA),
+       _mesa_enum_to_string(dfactorA));
 
    (void) sfactorRGB;
    (void) dfactorRGB;
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 42ea54e087d..57b033c07ea 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -255,6 +255,8 @@ i915CreateContext(int api,
     * FINISHME: vertex shaders?
     */
    ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitCondCodes = true;
+   ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectSampler =
+      true;
 
    struct gl_shader_compiler_options *const fs_options =
       & ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT];
@@ -266,6 +268,7 @@ i915CreateContext(int api,
    fs_options->EmitNoIndirectOutput = true;
    fs_options->EmitNoIndirectUniform = true;
    fs_options->EmitNoIndirectTemp = true;
+   fs_options->EmitNoIndirectSampler = true;
 
    ctx->Const.MaxDrawBuffers = 1;
    ctx->Const.QueryCounterBits.SamplesPassed = 0;
diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index 5f10b840b1a..4c83073e692 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -402,7 +402,7 @@ void
 intelCalcViewport(struct gl_context * ctx)
 {
    struct intel_context *intel = intel_context(ctx);
-   double scale[3], translate[3];
+   float scale[3], translate[3];
 
    _mesa_get_viewport_xform(ctx, 0, scale, translate);
 
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index aef5ff99eb2..f653f441ad8 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -342,7 +342,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
        * Thus, I guess we need do this for other platforms as well.
        */
       if (tObj->Target == GL_TEXTURE_CUBE_MAP_ARB &&
-          !is_power_of_two(firstImage->Height))
+          !_mesa_is_pow_two(firstImage->Height))
          return false;
 
       state[I915_TEXREG_SS3] = ss3;     /* SS3_NORMALIZED_COORDS */
diff --git a/src/mesa/drivers/dri/i915/intel_context.c b/src/mesa/drivers/dri/i915/intel_context.c
index 5618dcd8358..c780103228f 100644
--- a/src/mesa/drivers/dri/i915/intel_context.c
+++ b/src/mesa/drivers/dri/i915/intel_context.c
@@ -428,7 +428,6 @@ intelInitContext(struct intel_context *intel,
 
    driContextPriv->driverPrivate = intel;
    intel->driContext = driContextPriv;
-   intel->driFd = sPriv->fd;
 
    intel->gen = intelScreen->gen;
 
diff --git a/src/mesa/drivers/dri/i915/intel_context.h b/src/mesa/drivers/dri/i915/intel_context.h
index 350d35d9033..4ec4015d453 100644
--- a/src/mesa/drivers/dri/i915/intel_context.h
+++ b/src/mesa/drivers/dri/i915/intel_context.h
@@ -273,8 +273,6 @@ struct intel_context
 
    bool use_early_z;
 
-   int driFd;
-
    __DRIcontext *driContext;
    struct intel_screen *intelScreen;
 
diff --git a/src/mesa/drivers/dri/i915/intel_fbo.c b/src/mesa/drivers/dri/i915/intel_fbo.c
index a5d5c5832fb..67013666377 100644
--- a/src/mesa/drivers/dri/i915/intel_fbo.c
+++ b/src/mesa/drivers/dri/i915/intel_fbo.c
@@ -216,7 +216,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend
    intel_miptree_release(&irb->mt);
 
    DBG("%s: %s: %s (%dx%d)\n", __func__,
-       _mesa_lookup_enum_by_nr(internalFormat),
+       _mesa_enum_to_string(internalFormat),
        _mesa_get_format_name(rb->Format), width, height);
 
    if (width == 0 || height == 0)
diff --git a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
index e56b9859377..1aa06c18f15 100644
--- a/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i915/intel_mipmap_tree.c
@@ -81,7 +81,7 @@ intel_miptree_create_layout(struct intel_context *intel,
       return NULL;
 
    DBG("%s target %s format %s level %d..%d <-- %p\n", __func__,
-       _mesa_lookup_enum_by_nr(target),
+       _mesa_enum_to_string(target),
        _mesa_get_format_name(format),
        first_level, last_level, mt);
 
diff --git a/src/mesa/drivers/dri/i915/intel_render.c b/src/mesa/drivers/dri/i915/intel_render.c
index 0b0d48e1663..5962dad7d11 100644
--- a/src/mesa/drivers/dri/i915/intel_render.c
+++ b/src/mesa/drivers/dri/i915/intel_render.c
@@ -113,7 +113,7 @@ static void
 intelDmaPrimitive(struct intel_context *intel, GLenum prim)
 {
    if (0)
-      fprintf(stderr, "%s %s\n", __func__, _mesa_lookup_enum_by_nr(prim));
+      fprintf(stderr, "%s %s\n", __func__, _mesa_enum_to_string(prim));
    INTEL_FIREVERTICES(intel);
    intel->vtbl.reduced_primitive_state(intel, reduced_prim[prim]);
    intel_set_prim(intel, hw_prim[prim]);
diff --git a/src/mesa/drivers/dri/i915/intel_tex_image.c b/src/mesa/drivers/dri/i915/intel_tex_image.c
index 01de966a134..0a213e9f614 100644
--- a/src/mesa/drivers/dri/i915/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i915/intel_tex_image.c
@@ -189,7 +189,7 @@ intelTexImage(struct gl_context * ctx,
               const struct gl_pixelstore_attrib *unpack)
 {
    DBG("%s target %s level %d %dx%dx%d\n", __func__,
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
+       _mesa_enum_to_string(texImage->TexObject->Target),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
    /* Attempt to use the blitter for PBO image uploads.
diff --git a/src/mesa/drivers/dri/i915/intel_tex_subimage.c b/src/mesa/drivers/dri/i915/intel_tex_subimage.c
index 2e02d50f13f..f11ef2ea329 100644
--- a/src/mesa/drivers/dri/i915/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i915/intel_tex_subimage.c
@@ -72,7 +72,7 @@ intel_blit_texsubimage(struct gl_context * ctx,
 
    DBG("BLT subimage %s target %s level %d offset %d,%d %dx%d\n",
        __func__,
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
+       _mesa_enum_to_string(texImage->TexObject->Target),
        texImage->Level, xoffset, yoffset, width, height);
 
    pixels = _mesa_validate_pbo_teximage(ctx, 2, width, height, 1,
diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c
index 144f0fc911a..ae62a800fb7 100644
--- a/src/mesa/drivers/dri/i915/intel_tris.c
+++ b/src/mesa/drivers/dri/i915/intel_tris.c
@@ -1134,7 +1134,7 @@ intelRasterPrimitive(struct gl_context * ctx, GLenum rprim, GLuint hwprim)
 
    if (0)
       fprintf(stderr, "%s %s %x\n", __func__,
-              _mesa_lookup_enum_by_nr(rprim), hwprim);
+              _mesa_enum_to_string(rprim), hwprim);
 
    intel->vtbl.reduced_primitive_state(intel, rprim);
 
@@ -1158,7 +1158,7 @@ intelRenderPrimitive(struct gl_context * ctx, GLenum prim)
                          ctx->Polygon.BackMode != GL_FILL);
 
    if (0)
-      fprintf(stderr, "%s %s\n", __func__, _mesa_lookup_enum_by_nr(prim));
+      fprintf(stderr, "%s %s\n", __func__, _mesa_enum_to_string(prim));
 
    /* Let some clipping routines know which primitive they're dealing
     * with.
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 981fe79b132..dfdad75329d 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -60,6 +60,8 @@ i965_FILES = \
 	brw_fs_register_coalesce.cpp \
 	brw_fs_saturate_propagation.cpp \
 	brw_fs_sel_peephole.cpp \
+	brw_fs_surface_builder.cpp \
+	brw_fs_surface_builder.h \
 	brw_fs_vector_splitting.cpp \
 	brw_fs_visitor.cpp \
 	brw_gs.c \
@@ -86,6 +88,7 @@ i965_FILES = \
 	brw_object_purgeable.c \
 	brw_packed_float.c \
 	brw_performance_monitor.c \
+	brw_pipe_control.c \
 	brw_primitive_restart.c \
 	brw_program.c \
 	brw_program.h \
@@ -122,6 +125,8 @@ i965_FILES = \
 	brw_vec4.h \
 	brw_vec4_live_variables.cpp \
 	brw_vec4_live_variables.h \
+	brw_vec4_nir.cpp \
+	brw_vec4_gs_nir.cpp \
 	brw_vec4_reg_allocate.cpp \
 	brw_vec4_visitor.cpp \
 	brw_vec4_vp.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 98ff0ddcd58..b188fc7de57 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -44,6 +44,41 @@
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
 
+static const GLuint stage_to_bt_edit[] = {
+   [MESA_SHADER_VERTEX] = _3DSTATE_BINDING_TABLE_EDIT_VS,
+   [MESA_SHADER_GEOMETRY] = _3DSTATE_BINDING_TABLE_EDIT_GS,
+   [MESA_SHADER_FRAGMENT] = _3DSTATE_BINDING_TABLE_EDIT_PS,
+};
+
+static uint32_t
+reserve_hw_bt_space(struct brw_context *brw, unsigned bytes)
+{
+   /* From the Broadwell PRM, Volume 16, "Workarounds",
+    * WaStateBindingTableOverfetch:
+    * "HW over-fetches two cache lines of binding table indices.  When
+    *  using the resource streamer, SW needs to pad binding table pointer
+    *  updates with an additional two cache lines."
+    *
+    * Cache lines are 64 bytes, so we subtract 128 bytes from the size of
+    * the binding table pool buffer.
+    */
+   if (brw->hw_bt_pool.next_offset + bytes >= brw->hw_bt_pool.bo->size - 128) {
+      gen7_reset_hw_bt_pool_offsets(brw);
+   }
+
+   uint32_t offset = brw->hw_bt_pool.next_offset;
+
+   /* From the Haswell PRM, Volume 2b: Command Reference: Instructions,
+    * 3DSTATE_BINDING_TABLE_POINTERS_xS:
+    *
+    * "If HW Binding Table is enabled, the offset is relative to the
+    *  Binding Table Pool Base Address and the alignment is 64 bytes."
+    */
+   brw->hw_bt_pool.next_offset += ALIGN(bytes, 64);
+
+   return offset;
+}
+
 /**
  * Upload a shader stage's binding table as indirect state.
  *
@@ -72,22 +107,41 @@ brw_upload_binding_table(struct brw_context *brw,
             brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW,
             brw->shader_time.bo->size, 1, true);
       }
+      /* When the resource streamer (RS) is enabled, use hw-binding table
+       * uploads; otherwise fall back to software uploads.
+       */
+      if (brw->use_resource_streamer) {
+         gen7_update_binding_table_from_array(brw, stage_state->stage,
+                                              stage_state->surf_offset,
+                                              prog_data->binding_table
+                                              .size_bytes / 4);
+      } else {
+         uint32_t *bind = brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
+                                          prog_data->binding_table.size_bytes,
+                                          32,
+                                          &stage_state->bind_bo_offset);
 
-      uint32_t *bind = brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
-                                       prog_data->binding_table.size_bytes, 32,
-                                       &stage_state->bind_bo_offset);
-
-      /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
-      memcpy(bind, stage_state->surf_offset,
-             prog_data->binding_table.size_bytes);
+         /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
+         memcpy(bind, stage_state->surf_offset,
+                prog_data->binding_table.size_bytes);
+      }
    }
 
    brw->ctx.NewDriverState |= brw_new_binding_table;
 
    if (brw->gen >= 7) {
+      if (brw->use_resource_streamer) {
+         stage_state->bind_bo_offset =
+            reserve_hw_bt_space(brw, prog_data->binding_table.size_bytes);
+      }
       BEGIN_BATCH(2);
       OUT_BATCH(packet_name << 16 | (2 - 2));
-      OUT_BATCH(stage_state->bind_bo_offset);
+      /* Align SurfaceStateOffset[16:6] format to [15:5] PS Binding Table field
+       * when hw-generated binding table is enabled.
+       */
+      OUT_BATCH(brw->use_resource_streamer ?
+                (stage_state->bind_bo_offset >> 1) :
+                stage_state->bind_bo_offset);
       ADVANCE_BATCH();
    }
 }
@@ -170,6 +224,158 @@ const struct brw_tracked_state brw_gs_binding_table = {
    .emit = brw_gs_upload_binding_table,
 };
 
+/**
+ * Edit a single entry in a hardware-generated binding table
+ */
+void
+gen7_edit_hw_binding_table_entry(struct brw_context *brw,
+                                 gl_shader_stage stage,
+                                 uint32_t index,
+                                 uint32_t surf_offset)
+{
+   assert(stage < ARRAY_SIZE(stage_to_bt_edit));
+   assert(stage_to_bt_edit[stage]);
+
+   uint32_t dw2 = SET_FIELD(index, BRW_BINDING_TABLE_INDEX) |
+      (brw->gen >= 8 ? GEN8_SURFACE_STATE_EDIT(surf_offset) :
+       HSW_SURFACE_STATE_EDIT(surf_offset));
+
+   BEGIN_BATCH(3);
+   OUT_BATCH(stage_to_bt_edit[stage] << 16 | (3 - 2));
+   OUT_BATCH(BRW_BINDING_TABLE_EDIT_TARGET_ALL);
+   OUT_BATCH(dw2);
+   ADVANCE_BATCH();
+}
+
+/**
+ * Upload a whole hardware binding table for the given stage.
+ *
+ * Takes an array of surface offsets and the number of binding table
+ * entries.
+ */
+void
+gen7_update_binding_table_from_array(struct brw_context *brw,
+                                     gl_shader_stage stage,
+                                     const uint32_t* binding_table,
+                                     int num_surfaces)
+{
+   uint32_t dw2 = 0;
+
+   assert(stage < ARRAY_SIZE(stage_to_bt_edit));
+   assert(stage_to_bt_edit[stage]);
+
+   BEGIN_BATCH(num_surfaces + 2);
+   OUT_BATCH(stage_to_bt_edit[stage] << 16 | num_surfaces);
+   OUT_BATCH(BRW_BINDING_TABLE_EDIT_TARGET_ALL);
+   for (int i = 0; i < num_surfaces; i++) {
+      dw2 = SET_FIELD(i, BRW_BINDING_TABLE_INDEX) |
+         (brw->gen >= 8 ? GEN8_SURFACE_STATE_EDIT(binding_table[i]) :
+          HSW_SURFACE_STATE_EDIT(binding_table[i]));
+      OUT_BATCH(dw2);
+   }
+   ADVANCE_BATCH();
+}
+
+/**
+ * Disable hardware binding table support, falling back to the
+ * older software-generated binding table mechanism.
+ */
+void
+gen7_disable_hw_binding_tables(struct brw_context *brw)
+{
+   if (!brw->use_resource_streamer)
+      return;
+   /* From the Haswell PRM, Volume 7: 3D Media GPGPU,
+    * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
+    *
+    * "When switching between HW and SW binding table generation, SW must
+    * issue a state cache invalidate."
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+
+   int pkt_len = brw->gen >= 8 ? 4 : 3;
+
+   BEGIN_BATCH(pkt_len);
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2));
+   if (brw->gen >= 8) {
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+   } else {
+      OUT_BATCH(HSW_BT_POOL_ALLOC_MUST_BE_ONE);
+      OUT_BATCH(0);
+   }
+   ADVANCE_BATCH();
+}
+
+/**
+ * Enable hardware binding tables and set up the binding table pool.
+ */
+void
+gen7_enable_hw_binding_tables(struct brw_context *brw)
+{
+   if (!brw->use_resource_streamer)
+      return;
+
+   if (!brw->hw_bt_pool.bo) {
+      /* We use a single re-usable buffer object for the lifetime of the
+       * context and size it for the maximum number of binding tables that
+       * can be programmed per batch:
+       *
+       * From the Haswell PRM, Volume 7: 3D Media GPGPU,
+       * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
+       * "A maximum of 16,383 Binding tables are allowed in any batch buffer"
+       */
+      static const int max_size = 16383 * 4;
+      brw->hw_bt_pool.bo = drm_intel_bo_alloc(brw->bufmgr, "hw_bt",
+                                              max_size, 64);
+      brw->hw_bt_pool.next_offset = 0;
+   }
+
+   /* From the Haswell PRM, Volume 7: 3D Media GPGPU,
+    * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
+    *
+    * "When switching between HW and SW binding table generation, SW must
+    * issue a state cache invalidate."
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+
+   int pkt_len = brw->gen >= 8 ? 4 : 3;
+   uint32_t dw1 = BRW_HW_BINDING_TABLE_ENABLE;
+   if (brw->is_haswell) {
+      dw1 |= SET_FIELD(GEN7_MOCS_L3, GEN7_HW_BT_POOL_MOCS) |
+             HSW_BT_POOL_ALLOC_MUST_BE_ONE;
+   } else if (brw->gen >= 8) {
+      dw1 |= BDW_MOCS_WB;
+   }
+
+   BEGIN_BATCH(pkt_len);
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2));
+   if (brw->gen >= 8) {
+      OUT_RELOC64(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1);
+      OUT_BATCH(brw->hw_bt_pool.bo->size);
+   } else {
+      OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1);
+      OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0,
+             brw->hw_bt_pool.bo->size);
+   }
+   ADVANCE_BATCH();
+}
+
+void
+gen7_reset_hw_bt_pool_offsets(struct brw_context *brw)
+{
+   brw->hw_bt_pool.next_offset = 0;
+}
+
+const struct brw_tracked_state gen7_hw_binding_tables = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_BATCH,
+   },
+   .emit = gen7_enable_hw_binding_tables
+};
+
 /** @} */
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index b404869f0c7..eac1f005496 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -220,13 +220,13 @@ brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
     * data with different formats, which blorp does for stencil and depth
     * data.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
 retry:
    intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
    intel_batchbuffer_save_state(brw);
    drm_intel_bo *saved_bo = brw->batch.bo;
-   uint32_t saved_used = brw->batch.used;
+   uint32_t saved_used = USED_BATCH(brw->batch);
    uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;
 
    switch (brw->gen) {
@@ -245,7 +245,7 @@ retry:
     * reserved enough space that a wrap will never happen.
     */
    assert(brw->batch.bo == saved_bo);
-   assert((brw->batch.used - saved_used) * 4 +
+   assert((USED_BATCH(brw->batch) - saved_used) * 4 +
           (saved_state_batch_offset - brw->batch.state_batch_offset) <
           estimated_max_batch_usage);
    /* Shut up compiler warnings on release build */
@@ -283,7 +283,7 @@ retry:
    /* Flush the sampler cache so any texturing from the destination is
     * coherent.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 brw_hiz_op_params::brw_hiz_op_params(struct intel_mipmap_tree *mt,
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 1561b593969..205c905b447 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -1285,8 +1285,8 @@ brw_blorp_blit_program::translate_dst_to_src()
       /* Round the float coordinates down to nearest integer */
       emit_rndd(Xp_f, X_f);
       emit_rndd(Yp_f, Y_f);
-      emit_mul(X_f, Xp_f, brw_imm_f(1 / key->x_scale));
-      emit_mul(Y_f, Yp_f, brw_imm_f(1 / key->y_scale));
+      emit_mul(X_f, Xp_f, brw_imm_f(1.0f / key->x_scale));
+      emit_mul(Y_f, Yp_f, brw_imm_f(1.0f / key->y_scale));
       SWAP_XY_AND_XPYP();
    } else if (!key->bilinear_filter) {
       /* Round the float coordinates down to nearest integer by moving to
@@ -1442,7 +1442,7 @@ brw_blorp_blit_program::manual_blend_average(unsigned num_samples)
       for (int j = 0; j < 4; ++j) {
          emit_mul(offset(texture_data[0], 2*j),
                  offset(vec8(texture_data[0]), 2*j),
-                 brw_imm_f(1.0/num_samples));
+                 brw_imm_f(1.0f / num_samples));
       }
    }
 
@@ -1475,9 +1475,9 @@ brw_blorp_blit_program::manual_blend_bilinear(unsigned num_samples)
 
       /* Compute pixel coordinates */
       emit_add(vec16(x_sample_coords), Xp_f,
-              brw_imm_f((float)(i & 0x1) * (1.0 / key->x_scale)));
+              brw_imm_f((float)(i & 0x1) * (1.0f / key->x_scale)));
       emit_add(vec16(y_sample_coords), Yp_f,
-              brw_imm_f((float)((i >> 1) & 0x1) * (1.0 / key->y_scale)));
+              brw_imm_f((float)((i >> 1) & 0x1) * (1.0f / key->y_scale)));
       emit_mov(vec16(X), x_sample_coords);
       emit_mov(vec16(Y), y_sample_coords);
 
@@ -1789,7 +1789,7 @@ brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1,
        * so 0.5 provides the necessary correction.
        */
       multiplier = scale;
-      offset = src0 + (-dst0 + 0.5) * scale;
+      offset = src0 + (-dst0 + 0.5f) * scale;
    } else {
       /* When mirroring X we need:
        *   src_x - src_x0 = dst_x1 - dst_x - 0.5
@@ -1797,7 +1797,7 @@ brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1,
        *   src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
        */
       multiplier = -scale;
-      offset = src0 + (dst1 - 0.5) * scale;
+      offset = src0 + (dst1 - 0.5f) * scale;
    }
 }
 
@@ -1952,8 +1952,8 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
    /* Scaling factors used for bilinear filtering in multisample scaled
     * blits.
     */
-   wm_prog_key.x_scale = 2.0;
-   wm_prog_key.y_scale = src_mt->num_samples / 2.0;
+   wm_prog_key.x_scale = 2.0f;
+   wm_prog_key.y_scale = src_mt->num_samples / 2.0f;
 
    if (filter == GL_LINEAR && src.num_samples <= 1 && dst.num_samples <= 1)
       wm_prog_key.bilinear_filter = true;
@@ -2000,9 +2000,9 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
    x1 = wm_push_consts.dst_x1 = roundf(dst_x1);
    y1 = wm_push_consts.dst_y1 = roundf(dst_y1);
    wm_push_consts.rect_grid_x1 = (minify(src_mt->logical_width0, src_level) *
-                                  wm_prog_key.x_scale - 1.0);
+                                  wm_prog_key.x_scale - 1.0f);
    wm_push_consts.rect_grid_y1 = (minify(src_mt->logical_height0, src_level) *
-                                  wm_prog_key.y_scale - 1.0);
+                                  wm_prog_key.y_scale - 1.0f);
 
    wm_push_consts.x_transform.setup(src_x0, src_x1, dst_x0, dst_x1, mirror_x);
    wm_push_consts.y_transform.setup(src_y0, src_y1, dst_y0, dst_y1, mirror_y);
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index 789520c7353..d458ad846bf 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -73,7 +73,7 @@ brw_blorp_eu_emitter::emit_kill_if_outside_rect(const struct brw_reg &x,
    emit_cmp(BRW_CONDITIONAL_L, x, dst_x1)->predicate = BRW_PREDICATE_NORMAL;
    emit_cmp(BRW_CONDITIONAL_L, y, dst_y1)->predicate = BRW_PREDICATE_NORMAL;
 
-   fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, g1, f0, g1);
+   fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, 16, g1, f0, g1);
    inst->force_writemask_all = true;
    insts.push_tail(inst);
 }
@@ -84,7 +84,7 @@ brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst,
                                           unsigned base_mrf,
                                           unsigned msg_length)
 {
-   fs_inst *inst = new (mem_ctx) fs_inst(op, dst, brw_message_reg(base_mrf),
+   fs_inst *inst = new (mem_ctx) fs_inst(op, 16, dst, brw_message_reg(base_mrf),
                                          fs_reg(0u));
 
    inst->base_mrf = base_mrf;
@@ -119,7 +119,8 @@ brw_blorp_eu_emitter::emit_combine(enum opcode combine_opcode,
 {
    assert(combine_opcode == BRW_OPCODE_ADD || combine_opcode == BRW_OPCODE_AVG);
 
-   insts.push_tail(new (mem_ctx) fs_inst(combine_opcode, dst, src_1, src_2));
+   insts.push_tail(new (mem_ctx) fs_inst(combine_opcode, 16, dst,
+                                         src_1, src_2));
 }
 
 fs_inst *
@@ -127,7 +128,7 @@ brw_blorp_eu_emitter::emit_cmp(enum brw_conditional_mod op,
                                const struct brw_reg &x,
                                const struct brw_reg &y)
 {
-   fs_inst *cmp = new (mem_ctx) fs_inst(BRW_OPCODE_CMP,
+   fs_inst *cmp = new (mem_ctx) fs_inst(BRW_OPCODE_CMP, 16,
                                         vec16(brw_null_reg()), x, y);
    cmp->conditional_mod = op;
    insts.push_tail(cmp);
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
index f1f230e3751..91d53eff5a7 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
@@ -208,6 +208,7 @@ cfg_t::cfg_t(exec_list *instructions)
          cur_else = cur;
 
 	 next = new_block();
+         assert(cur_if != NULL);
 	 cur_if->add_successor(mem_ctx, next);
 
 	 set_next_block(&cur, next, ip);
@@ -274,6 +275,7 @@ cfg_t::cfg_t(exec_list *instructions)
          inst->exec_node::remove();
          cur->instructions.push_tail(inst);
 
+         assert(cur_do != NULL);
 	 cur->add_successor(mem_ctx, cur_do);
 
 	 next = new_block();
@@ -287,6 +289,7 @@ cfg_t::cfg_t(exec_list *instructions)
          inst->exec_node::remove();
          cur->instructions.push_tail(inst);
 
+         assert(cur_while != NULL);
 	 cur->add_successor(mem_ctx, cur_while);
 
 	 next = new_block();
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index 1d4ba3cac7e..f981388ef1a 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -184,7 +184,7 @@ brw_fast_clear_depth(struct gl_context *ctx)
     *      must be issued before the rectangle primitive used for the depth
     *      buffer clear operation.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (fb->MaxNumLayers > 0) {
       for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) {
@@ -204,7 +204,7 @@ brw_fast_clear_depth(struct gl_context *ctx)
        *      by a PIPE_CONTROL command with DEPTH_STALL bit set and Then
        *      followed by Depth FLUSH'
       */
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
    }
 
    /* Now, the HiZ buffer contains data that needs to be resolved to the depth
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index ebf12fab69e..328662da82e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -506,6 +506,18 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers = BRW_MAX_ABO;
       ctx->Const.Program[MESA_SHADER_COMPUTE].MaxAtomicBuffers = BRW_MAX_ABO;
       ctx->Const.MaxCombinedAtomicBuffers = 3 * BRW_MAX_ABO;
+
+      ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms =
+         BRW_MAX_IMAGES;
+      ctx->Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms =
+         (brw->intelScreen->compiler->scalar_vs ? BRW_MAX_IMAGES : 0);
+      ctx->Const.Program[MESA_SHADER_COMPUTE].MaxImageUniforms =
+         BRW_MAX_IMAGES;
+      ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS;
+      ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs =
+         MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
+      ctx->Const.MaxImageSamples = 0;
+      ctx->Const.MaxCombinedImageUniforms = 3 * BRW_MAX_IMAGES;
    }
 
    /* Gen6 converts quads to polygon in beginning of 3D pipeline,
@@ -716,6 +728,7 @@ brwCreateContext(gl_api api,
    brw->is_baytrail = devinfo->is_baytrail;
    brw->is_haswell = devinfo->is_haswell;
    brw->is_cherryview = devinfo->is_cherryview;
+   brw->is_broxton = devinfo->is_broxton;
    brw->has_llc = devinfo->has_llc;
    brw->has_hiz = devinfo->has_hiz_and_separate_stencil;
    brw->has_separate_stencil = devinfo->has_hiz_and_separate_stencil;
@@ -820,6 +833,12 @@ brwCreateContext(gl_api api,
       }
    }
 
+   if (brw_init_pipe_control(brw, devinfo)) {
+      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
+      intelDestroyContext(driContextPriv);
+      return false;
+   }
+
    brw_init_state(brw);
 #endif
 
@@ -867,6 +886,10 @@ brwCreateContext(gl_api api,
 
    brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
 
+   brw->use_resource_streamer = screen->has_resource_streamer &&
+      (brw_env_var_as_boolean("INTEL_USE_HW_BT", false) ||
+       brw_env_var_as_boolean("INTEL_USE_GATHER", false));
+
    ctx->VertexProgram._MaintainTnlProgram = true;
    ctx->FragmentProgram._MaintainTexEnvProgram = true;
 
@@ -935,6 +958,10 @@ intelDestroyContext(__DRIcontext * driContextPriv)
    if (brw->wm.base.scratch_bo)
       drm_intel_bo_unreference(brw->wm.base.scratch_bo);
 
+   gen7_reset_hw_bt_pool_offsets(brw);
+   drm_intel_bo_unreference(brw->hw_bt_pool.bo);
+   brw->hw_bt_pool.bo = NULL;
+
    drm_intel_gem_context_destroy(brw->hw_ctx);
 
    if (ctx->swrast_context) {
@@ -946,6 +973,7 @@ intelDestroyContext(__DRIcontext * driContextPriv)
    if (ctx->swrast_context)
       _swrast_DestroyContext(&brw->ctx);
 
+   brw_fini_pipe_control(brw);
    intel_batchbuffer_free(brw);
 
    drm_intel_bo_unreference(brw->throttle_batch[1]);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 9e1f722df9e..1267a6f5a97 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -201,6 +201,7 @@ enum brw_state_id {
    BRW_STATE_STATS_WM,
    BRW_STATE_UNIFORM_BUFFER,
    BRW_STATE_ATOMIC_BUFFER,
+   BRW_STATE_IMAGE_UNITS,
    BRW_STATE_META_IN_PROGRESS,
    BRW_STATE_INTERPOLATION_MAP,
    BRW_STATE_PUSH_CONSTANT_ALLOCATION,
@@ -282,6 +283,7 @@ enum brw_state_id {
 #define BRW_NEW_STATS_WM                (1ull << BRW_STATE_STATS_WM)
 #define BRW_NEW_UNIFORM_BUFFER          (1ull << BRW_STATE_UNIFORM_BUFFER)
 #define BRW_NEW_ATOMIC_BUFFER           (1ull << BRW_STATE_ATOMIC_BUFFER)
+#define BRW_NEW_IMAGE_UNITS             (1ull << BRW_STATE_IMAGE_UNITS)
 #define BRW_NEW_META_IN_PROGRESS        (1ull << BRW_STATE_META_IN_PROGRESS)
 #define BRW_NEW_INTERPOLATION_MAP       (1ull << BRW_STATE_INTERPOLATION_MAP)
 #define BRW_NEW_PUSH_CONSTANT_ALLOCATION (1ull << BRW_STATE_PUSH_CONSTANT_ALLOCATION)
@@ -367,6 +369,7 @@ struct brw_stage_prog_data {
 
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
+   unsigned nr_image_params;
 
    unsigned curb_read_length;
    unsigned total_scratch;
@@ -387,6 +390,59 @@ struct brw_stage_prog_data {
     */
    const gl_constant_value **param;
    const gl_constant_value **pull_param;
+
+   /**
+    * Image metadata passed to the shader as uniforms.  This is deliberately
+    * ignored by brw_stage_prog_data_compare() because its contents don't have
+    * any influence on program compilation.
+    */
+   struct brw_image_param *image_param;
+};
+
+/*
+ * Image metadata structure as laid out in the shader parameter
+ * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
+ * able to use them.  That's okay because the padding and any unused
+ * entries [most of them except when we're doing untyped surface
+ * access] will be removed by the uniform packing pass.
+ */
+#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET      0
+#define BRW_IMAGE_PARAM_OFFSET_OFFSET           4
+#define BRW_IMAGE_PARAM_SIZE_OFFSET             8
+#define BRW_IMAGE_PARAM_STRIDE_OFFSET           12
+#define BRW_IMAGE_PARAM_TILING_OFFSET           16
+#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        20
+#define BRW_IMAGE_PARAM_SIZE                    24
+
+struct brw_image_param {
+   /** Surface binding table index. */
+   uint32_t surface_idx;
+
+   /** Offset applied to the X and Y surface coordinates. */
+   uint32_t offset[2];
+
+   /** Surface X, Y and Z dimensions. */
+   uint32_t size[3];
+
+   /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
+    * pixels, vertical slice stride in pixels.
+    */
+   uint32_t stride[4];
+
+   /** Log2 of the tiling modulus in the X, Y and Z dimensions. */
+   uint32_t tiling[3];
+
+   /**
+    * Right shift to apply for bit 6 address swizzling.  Two different
+    * swizzles can be specified and will be applied one after the other.  The
+    * resulting address will be:
+    *
+    *  addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
+    *                              (addr >> swizzling[1])))
+    *
+    * Use \c 0xff if any of the swizzles is not required.
+    */
+   uint32_t swizzling[2];
 };
 
 /* Data about a particular attempt to compile a program.  Note that
@@ -416,11 +472,13 @@ struct brw_wm_prog_data {
 
    uint8_t computed_depth_mode;
 
+   bool early_fragment_tests;
    bool no_8;
    bool dual_src_blend;
    bool uses_pos_offset;
    bool uses_omask;
    bool uses_kill;
+   bool pulls_bary;
    uint32_t prog_offset_16;
 
    /**
@@ -874,11 +932,12 @@ struct intel_batchbuffer {
    drm_intel_bo *bo;
    /** Last BO submitted to the hardware.  Used for glFinish(). */
    drm_intel_bo *last_bo;
-   /** BO for post-sync nonzero writes for gen6 workaround. */
-   drm_intel_bo *workaround_bo;
 
+#ifdef DEBUG
    uint16_t emit, total;
-   uint16_t used, reserved_space;
+#endif
+   uint16_t reserved_space;
+   uint32_t *map_next;
    uint32_t *map;
    uint32_t *cpu_map;
 #define BATCH_SZ (8192*sizeof(uint32_t))
@@ -887,10 +946,8 @@ struct intel_batchbuffer {
    enum brw_gpu_ring ring;
    bool needs_sol_reset;
 
-   uint8_t pipe_controls_since_last_cs_stall;
-
    struct {
-      uint16_t used;
+      uint32_t *map_next;
       int reloc_count;
    } saved;
 };
@@ -1040,6 +1097,10 @@ struct brw_context
 
    drm_intel_context *hw_ctx;
 
+   /** BO for post-sync nonzero writes for gen6 workaround. */
+   drm_intel_bo *workaround_bo;
+   uint8_t pipe_controls_since_last_cs_stall;
+
    /**
     * Set of drm_intel_bo * that have been rendered to within this batchbuffer
     * and would need flushing before being used from another cache domain that
@@ -1123,6 +1184,7 @@ struct brw_context
    bool is_baytrail;
    bool is_haswell;
    bool is_cherryview;
+   bool is_broxton;
 
    bool has_hiz;
    bool has_separate_stencil;
@@ -1135,6 +1197,7 @@ struct brw_context
    bool has_pln;
    bool no_simd8;
    bool use_rep_send;
+   bool use_resource_streamer;
 
    /**
     * Some versions of Gen hardware don't do centroid interpolation correctly
@@ -1241,12 +1304,12 @@ struct brw_context
     * Platform specific constants containing the maximum number of threads
     * for each pipeline stage.
     */
-   int max_vs_threads;
-   int max_hs_threads;
-   int max_ds_threads;
-   int max_gs_threads;
-   int max_wm_threads;
-   int max_cs_threads;
+   unsigned max_vs_threads;
+   unsigned max_hs_threads;
+   unsigned max_ds_threads;
+   unsigned max_gs_threads;
+   unsigned max_wm_threads;
+   unsigned max_cs_threads;
 
    /* BRW_NEW_URB_ALLOCATIONS:
     */
@@ -1398,6 +1461,12 @@ struct brw_context
       struct brw_cs_prog_data *prog_data;
    } cs;
 
+   /* RS hardware binding table */
+   struct {
+      drm_intel_bo *bo;
+      uint32_t next_offset;
+   } hw_bt_pool;
+
    struct {
       uint32_t state_offset;
       uint32_t blend_state_offset;
@@ -1453,8 +1522,8 @@ struct brw_context
    } perfmon;
 
    int num_atoms[BRW_NUM_PIPELINES];
-   const struct brw_tracked_state render_atoms[57];
-   const struct brw_tracked_state compute_atoms[3];
+   const struct brw_tracked_state render_atoms[60];
+   const struct brw_tracked_state compute_atoms[4];
 
    /* If (INTEL_DEBUG & DEBUG_BATCH) */
    struct {
@@ -1732,11 +1801,17 @@ void brw_upload_abo_surfaces(struct brw_context *brw,
                              struct gl_shader_program *prog,
                              struct brw_stage_state *stage_state,
                              struct brw_stage_prog_data *prog_data);
+void brw_upload_image_surfaces(struct brw_context *brw,
+                               struct gl_shader *shader,
+                               struct brw_stage_state *stage_state,
+                               struct brw_stage_prog_data *prog_data);
 
 /* brw_surface_formats.c */
 bool brw_render_target_supported(struct brw_context *brw,
                                  struct gl_renderbuffer *rb);
 uint32_t brw_depth_format(struct brw_context *brw, mesa_format format);
+mesa_format brw_lower_mesa_image_format(const struct brw_device_info *devinfo,
+                                        mesa_format format);
 
 /* brw_performance_monitor.c */
 void brw_init_performance_monitors(struct brw_context *brw);
@@ -2013,6 +2088,21 @@ bool
 gen9_use_linear_1d_layout(const struct brw_context *brw,
                           const struct intel_mipmap_tree *mt);
 
+/* brw_pipe_control.c */
+int brw_init_pipe_control(struct brw_context *brw,
+			  const struct brw_device_info *info);
+void brw_fini_pipe_control(struct brw_context *brw);
+
+void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
+void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
+                                 drm_intel_bo *bo, uint32_t offset,
+                                 uint32_t imm_lower, uint32_t imm_upper);
+void brw_emit_mi_flush(struct brw_context *brw);
+void brw_emit_post_sync_nonzero_flush(struct brw_context *brw);
+void brw_emit_depth_stall_flushes(struct brw_context *brw);
+void gen7_emit_vs_workaround_flush(struct brw_context *brw);
+void gen7_emit_cs_stall_flush(struct brw_context *brw);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
index 42a082b57b6..6ce5779137e 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -82,7 +82,7 @@ brw_cs_emit(struct brw_context *brw,
    prog_data->local_size[0] = cp->LocalSize[0];
    prog_data->local_size[1] = cp->LocalSize[1];
    prog_data->local_size[2] = cp->LocalSize[2];
-   int local_workgroup_size =
+   unsigned local_workgroup_size =
       cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];
 
    cfg_t *cfg = NULL;
@@ -182,7 +182,8 @@ brw_codegen_cs_prog(struct brw_context *brw,
     * prog_data associated with the compiled program, and which will be freed
     * by the state cache.
     */
-   int param_count = cs->num_uniform_components;
+   int param_count = cs->num_uniform_components +
+                     cs->NumImages * BRW_IMAGE_PARAM_SIZE;
 
    /* The backend also sometimes adds params for texture size. */
    param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
@@ -190,7 +191,10 @@ brw_codegen_cs_prog(struct brw_context *brw,
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    prog_data.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param, cs->NumImages);
    prog_data.base.nr_params = param_count;
+   prog_data.base.nr_image_params = cs->NumImages;
 
    program = brw_cs_emit(brw, mem_ctx, key, &prog_data,
                          &cp->program, prog, &program_size);
@@ -291,6 +295,17 @@ brw_cs_precompile(struct gl_context *ctx,
 }
 
 
+/* Threads per workgroup: local invocations / SIMD width, rounded up. */
+static unsigned
+get_cs_thread_count(const struct brw_cs_prog_data *cs_prog_data)
+{
+   const unsigned width = cs_prog_data->simd_size;
+   const unsigned invocations = cs_prog_data->local_size[0] *
+      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
+   return (invocations + width - 1) / width;
+}
+
+
 static void
 brw_upload_cs_state(struct brw_context *brw)
 {
@@ -316,6 +331,8 @@ brw_upload_cs_state(struct brw_context *brw)
                                             prog_data->binding_table.size_bytes,
                                             32, &stage_state->bind_bo_offset);
 
+   unsigned threads = get_cs_thread_count(cs_prog_data);
+
    uint32_t dwords = brw->gen < 8 ? 8 : 9;
    BEGIN_BATCH(dwords);
    OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));
@@ -365,6 +382,13 @@ brw_upload_cs_state(struct brw_context *brw)
    desc[dw++] = 0;
    desc[dw++] = 0;
    desc[dw++] = stage_state->bind_bo_offset;
+   desc[dw++] = 0;
+   const uint32_t media_threads =
+      brw->gen >= 8 ?
+      SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
+      SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT);
+   assert(threads <= brw->max_cs_threads);
+   desc[dw++] = media_threads;
 
    BEGIN_BATCH(4);
    OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index befd7a9538c..a149ce3ba12 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -176,7 +176,7 @@ void brw_upload_cs_urb_state(struct brw_context *brw)
    ADVANCE_BATCH();
 }
 
-static GLfloat fixed_plane[6][4] = {
+static const GLfloat fixed_plane[6][4] = {
    { 0,    0,   -1, 1 },
    { 0,    0,    1, 1 },
    { 0,   -1,    0, 1 },
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index c113d52a3d3..3bbaf977bc5 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -877,6 +877,21 @@ enum opcode {
     * instructions.
     */
    FS_OPCODE_FB_WRITE = 128,
+
+   /**
+    * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as
+    * individual sources instead of as a single payload blob:
+    *
+    * Source 0: [required] Color 0.
+    * Source 1: [optional] Color 1 (for dual source blend messages).
+    * Source 2: [optional] Src0 Alpha.
+    * Source 3: [optional] Source Depth (passthrough from the thread payload).
+    * Source 4: [optional] Destination Depth (gl_FragDepth).
+    * Source 5: [optional] Sample Mask (gl_SampleMask).
+    * Source 6: [required] Number of color components (as a UD immediate).
+    */
+   FS_OPCODE_FB_WRITE_LOGICAL,
+
    FS_OPCODE_BLORP_FB_WRITE,
    FS_OPCODE_REP_FB_WRITE,
    SHADER_OPCODE_RCP,
@@ -890,18 +905,49 @@ enum opcode {
    SHADER_OPCODE_SIN,
    SHADER_OPCODE_COS,
 
+   /**
+    * Texture sampling opcodes.
+    *
+    * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+    * opcode but instead of taking a single payload blob they expect their
+    * arguments separately as individual sources:
+    *
+    * Source 0: [optional] Texture coordinates.
+    * Source 1: [optional] Shadow comparator.
+    * Source 2: [optional] dPdx if the operation takes explicit derivatives,
+    *                      otherwise LOD value.
+    * Source 3: [optional] dPdy if the operation takes explicit derivatives.
+    * Source 4: [optional] Sample index.
+    * Source 5: [optional] MCS data.
+    * Source 6: [required] Texture sampler.
+    * Source 7: [optional] Texel offset.
+    * Source 8: [required] Number of coordinate components (as UD immediate).
+    * Source 9: [required] Number of derivative components (as UD immediate).
+    */
    SHADER_OPCODE_TEX,
+   SHADER_OPCODE_TEX_LOGICAL,
    SHADER_OPCODE_TXD,
+   SHADER_OPCODE_TXD_LOGICAL,
    SHADER_OPCODE_TXF,
+   SHADER_OPCODE_TXF_LOGICAL,
    SHADER_OPCODE_TXL,
+   SHADER_OPCODE_TXL_LOGICAL,
    SHADER_OPCODE_TXS,
+   SHADER_OPCODE_TXS_LOGICAL,
    FS_OPCODE_TXB,
+   FS_OPCODE_TXB_LOGICAL,
    SHADER_OPCODE_TXF_CMS,
+   SHADER_OPCODE_TXF_CMS_LOGICAL,
    SHADER_OPCODE_TXF_UMS,
+   SHADER_OPCODE_TXF_UMS_LOGICAL,
    SHADER_OPCODE_TXF_MCS,
+   SHADER_OPCODE_TXF_MCS_LOGICAL,
    SHADER_OPCODE_LOD,
+   SHADER_OPCODE_LOD_LOGICAL,
    SHADER_OPCODE_TG4,
+   SHADER_OPCODE_TG4_LOGICAL,
    SHADER_OPCODE_TG4_OFFSET,
+   SHADER_OPCODE_TG4_OFFSET_LOGICAL,
 
    /**
     * Combines multiple sources of size 1 into a larger virtual GRF.
@@ -919,13 +965,33 @@ enum opcode {
 
    SHADER_OPCODE_SHADER_TIME_ADD,
 
+   /**
+    * Typed and untyped surface access opcodes.
+    *
+    * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+    * opcode but instead of taking a single payload blob they expect their
+    * arguments separately as individual sources:
+    *
+    * Source 0: [required] Surface coordinates.
+    * Source 1: [optional] Operation source.
+    * Source 2: [required] Surface index.
+    * Source 3: [required] Number of coordinate components (as UD immediate).
+    * Source 4: [required] Opcode-specific control immediate, same as source 2
+    *                      of the matching non-LOGICAL opcode.
+    */
    SHADER_OPCODE_UNTYPED_ATOMIC,
+   SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+   SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+   SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
 
    SHADER_OPCODE_TYPED_ATOMIC,
+   SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
    SHADER_OPCODE_TYPED_SURFACE_READ,
+   SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+   SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
 
    SHADER_OPCODE_MEMORY_FENCE,
 
@@ -971,7 +1037,6 @@ enum opcode {
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
-   FS_OPCODE_SET_OMASK,
    FS_OPCODE_SET_SAMPLE_ID,
    FS_OPCODE_SET_SIMD4X2_OFFSET,
    FS_OPCODE_PACK_HALF_2x16_SPLIT,
@@ -1151,6 +1216,11 @@ enum opcode {
     * GLSL barrier()
     */
    SHADER_OPCODE_BARRIER,
+
+   /**
+    * Calculate the high 32 bits of a 32x32 multiply.
+    */
+   SHADER_OPCODE_MULH,
 };
 
 enum brw_urb_write_flags {
@@ -1642,6 +1712,36 @@ enum brw_message_target {
 #define _3DSTATE_BINDING_TABLE_POINTERS_GS	0x7829 /* GEN7+ */
 #define _3DSTATE_BINDING_TABLE_POINTERS_PS	0x782A /* GEN7+ */
 
+#define _3DSTATE_BINDING_TABLE_POOL_ALLOC       0x7919 /* GEN7.5+ */
+#define BRW_HW_BINDING_TABLE_ENABLE             (1 << 11)
+#define GEN7_HW_BT_POOL_MOCS_SHIFT              7
+#define GEN7_HW_BT_POOL_MOCS_MASK               INTEL_MASK(10, 7)
+#define GEN8_HW_BT_POOL_MOCS_SHIFT              0
+#define GEN8_HW_BT_POOL_MOCS_MASK               INTEL_MASK(6, 0)
+/* Only required in HSW */
+#define HSW_BT_POOL_ALLOC_MUST_BE_ONE           (3 << 5)
+
+#define _3DSTATE_BINDING_TABLE_EDIT_VS          0x7843 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_GS          0x7844 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_HS          0x7845 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_DS          0x7846 /* GEN7.5 */
+#define _3DSTATE_BINDING_TABLE_EDIT_PS          0x7847 /* GEN7.5 */
+#define BRW_BINDING_TABLE_INDEX_SHIFT           16
+#define BRW_BINDING_TABLE_INDEX_MASK            INTEL_MASK(23, 16)
+
+#define BRW_BINDING_TABLE_EDIT_TARGET_ALL       3
+#define BRW_BINDING_TABLE_EDIT_TARGET_CORE1     2
+#define BRW_BINDING_TABLE_EDIT_TARGET_CORE0     1
+/* In HSW, when editing binding table entries to surface state offsets,
+ * the surface state offset is a 16-bit value aligned to 32 bytes. But
+ * Surface State Pointer in dword 2 is [15:0]. Right shift surf_offset
+ * by 5 bits so it won't disturb bit 16 (which is used as the binding
+ * table index entry), otherwise it would hang the GPU.
+ */
+#define HSW_SURFACE_STATE_EDIT(value)           ((value) >> 5)
+/* Same as Haswell, but surface state offsets are now aligned to 64 bytes. */
+#define GEN8_SURFACE_STATE_EDIT(value)          ((value) >> 6)
+
 #define _3DSTATE_SAMPLER_STATE_POINTERS		0x7802 /* GEN6+ */
 # define PS_SAMPLER_STATE_CHANGE				(1 << 12)
 # define GS_SAMPLER_STATE_CHANGE				(1 << 9)
@@ -1757,6 +1857,7 @@ enum brw_message_target {
 # define GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT	18
 # define GEN6_VS_FLOATING_POINT_MODE_IEEE_754		(0 << 16)
 # define GEN6_VS_FLOATING_POINT_MODE_ALT		(1 << 16)
+# define HSW_VS_UAV_ACCESS_ENABLE                       (1 << 12)
 /* DW4 */
 # define GEN6_VS_DISPATCH_START_GRF_SHIFT		20
 # define GEN6_VS_URB_READ_LENGTH_SHIFT			11
@@ -1782,6 +1883,7 @@ enum brw_message_target {
 # define GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT	18
 # define GEN6_GS_FLOATING_POINT_MODE_IEEE_754		(0 << 16)
 # define GEN6_GS_FLOATING_POINT_MODE_ALT		(1 << 16)
+# define HSW_GS_UAV_ACCESS_ENABLE       		(1 << 12)
 /* DW4 */
 # define GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT		23
 # define GEN7_GS_OUTPUT_TOPOLOGY_SHIFT			17
@@ -2147,6 +2249,7 @@ enum brw_pixel_shader_computed_depth_mode {
 # define GEN8_PSX_SHADER_DISABLES_ALPHA_TO_COVERAGE     (1 << 7)
 # define GEN8_PSX_SHADER_IS_PER_SAMPLE                  (1 << 6)
 # define GEN8_PSX_SHADER_COMPUTES_STENCIL               (1 << 5)
+# define GEN9_PSX_SHADER_PULLS_BARY                     (1 << 3)
 # define GEN8_PSX_SHADER_HAS_UAV                        (1 << 2)
 # define GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK       (1 << 1)
 
@@ -2283,6 +2386,9 @@ enum brw_wm_barycentric_interp_mode {
 # define GEN7_WM_KILL_ENABLE				(1 << 25)
 # define GEN7_WM_COMPUTED_DEPTH_MODE_SHIFT              23
 # define GEN7_WM_USES_SOURCE_DEPTH			(1 << 20)
+# define GEN7_WM_EARLY_DS_CONTROL_NORMAL                (0 << 21)
+# define GEN7_WM_EARLY_DS_CONTROL_PSEXEC                (1 << 21)
+# define GEN7_WM_EARLY_DS_CONTROL_PREPS                 (2 << 21)
 # define GEN7_WM_USES_SOURCE_W			        (1 << 19)
 # define GEN7_WM_POSITION_ZW_PIXEL			(0 << 17)
 # define GEN7_WM_POSITION_ZW_CENTROID			(2 << 17)
@@ -2307,6 +2413,7 @@ enum brw_wm_barycentric_interp_mode {
 /* DW2 */
 # define GEN7_WM_MSDISPMODE_PERSAMPLE			(0 << 31)
 # define GEN7_WM_MSDISPMODE_PERPIXEL			(1 << 31)
+# define HSW_WM_UAV_ONLY                                (1 << 30)
 
 #define _3DSTATE_PS				0x7820 /* GEN7+ */
 /* DW1: kernel pointer */
@@ -2330,6 +2437,7 @@ enum brw_wm_barycentric_interp_mode {
 # define GEN7_PS_RENDER_TARGET_FAST_CLEAR_ENABLE	(1 << 8)
 # define GEN7_PS_DUAL_SOURCE_BLEND_ENABLE		(1 << 7)
 # define GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE		(1 << 6)
+# define HSW_PS_UAV_ACCESS_ENABLE			(1 << 5)
 # define GEN7_PS_POSOFFSET_NONE				(0 << 3)
 # define GEN7_PS_POSOFFSET_CENTROID			(2 << 3)
 # define GEN7_PS_POSOFFSET_SAMPLE			(3 << 3)
@@ -2493,12 +2601,13 @@ enum brw_wm_barycentric_interp_mode {
 #define BDW_MOCS_WT  0x58
 #define BDW_MOCS_PTE 0x18
 
-/* Skylake: MOCS is now an index into an array of 64 different configurable
- * cache settings.  We still use only either write-back or write-through; and
- * rely on the documented default values.
+/* Skylake: MOCS is now an index into an array of 62 different caching
+ * configurations programmed by the kernel.
  */
-#define SKL_MOCS_WB (0b001001 << 1)
-#define SKL_MOCS_WT (0b000101 << 1)
+/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
+#define SKL_MOCS_WB  (2 << 1)
+/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
+#define SKL_MOCS_PTE (1 << 1)
 
 #define MEDIA_VFE_STATE                         0x7000
 /* GEN7 DW2, GEN8+ DW3 */
@@ -2519,6 +2628,11 @@ enum brw_wm_barycentric_interp_mode {
 # define MEDIA_VFE_STATE_CURBE_ALLOC_MASK       INTEL_MASK(15, 0)
 
 #define MEDIA_INTERFACE_DESCRIPTOR_LOAD         0x7002
+/* GEN7 DW5, GEN8+ DW6 */
+# define MEDIA_GPGPU_THREAD_COUNT_SHIFT         0
+# define MEDIA_GPGPU_THREAD_COUNT_MASK          INTEL_MASK(7, 0)
+# define GEN8_MEDIA_GPGPU_THREAD_COUNT_SHIFT    0
+# define GEN8_MEDIA_GPGPU_THREAD_COUNT_MASK     INTEL_MASK(9, 0)
 #define MEDIA_STATE_FLUSH                       0x7004
 #define GPGPU_WALKER                            0x7105
 /* GEN8+ DW2 */
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index a07b86e60e2..16c125d07ee 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -170,7 +170,8 @@ static const struct brw_device_info brw_device_info_byt = {
 #define HSW_FEATURES             \
    GEN7_FEATURES,                \
    .is_haswell = true,           \
-   .supports_simd16_3src = true
+   .supports_simd16_3src = true, \
+   .has_resource_streamer = true
 
 static const struct brw_device_info brw_device_info_hsw_gt1 = {
    HSW_FEATURES, .gt = 1,
@@ -229,6 +230,7 @@ static const struct brw_device_info brw_device_info_hsw_gt3 = {
 #define GEN8_FEATURES                               \
    .gen = 8,                                        \
    .has_hiz_and_separate_stencil = true,            \
+   .has_resource_streamer = true,                   \
    .must_use_separate_stencil = true,               \
    .has_llc = true,                                 \
    .has_pln = true,                                 \
@@ -297,41 +299,62 @@ static const struct brw_device_info brw_device_info_chv = {
    }
 };
 
-/* Thread counts and URB limits are placeholders, and may not be accurate. */
 #define GEN9_FEATURES                               \
    .gen = 9,                                        \
    .has_hiz_and_separate_stencil = true,            \
+   .has_resource_streamer = true,                   \
    .must_use_separate_stencil = true,               \
    .has_llc = true,                                 \
    .has_pln = true,                                 \
-   .max_vs_threads = 280,                           \
-   .max_gs_threads = 256,                           \
-   .max_wm_threads = 408,                           \
+   .supports_simd16_3src = true,                    \
+   .max_vs_threads = 336,                           \
+   .max_gs_threads = 336,                           \
+   .max_hs_threads = 336,                           \
+   .max_ds_threads = 336,                           \
+   .max_wm_threads = 64 * 6,                        \
+   .max_cs_threads = 56,                            \
    .urb = {                                         \
-      .size = 128,                                  \
+      .size = 192,                                  \
       .min_vs_entries = 64,                         \
-      .max_vs_entries = 1664,                       \
+      .max_vs_entries = 1856,                       \
+      .max_hs_entries = 672,                        \
+      .max_ds_entries = 1120,                       \
       .max_gs_entries = 640,                        \
    }
 
-static const struct brw_device_info brw_device_info_skl_early = {
-   GEN9_FEATURES, .gt = 1,
-   .supports_simd16_3src = false,
-};
-
 static const struct brw_device_info brw_device_info_skl_gt1 = {
    GEN9_FEATURES, .gt = 1,
-   .supports_simd16_3src = true,
 };
 
 static const struct brw_device_info brw_device_info_skl_gt2 = {
    GEN9_FEATURES, .gt = 2,
-   .supports_simd16_3src = true,
 };
 
 static const struct brw_device_info brw_device_info_skl_gt3 = {
    GEN9_FEATURES, .gt = 3,
-   .supports_simd16_3src = true,
+};
+
+/* Broxton: GEN9_FEATURES with LLC, thread-count and URB overrides. */
+static const struct brw_device_info brw_device_info_bxt = {
+   GEN9_FEATURES, .gt = 1,
+   .is_broxton = true,
+   .has_llc = false, /* replaces the .has_llc = true set by GEN9_FEATURES */
+
+   /* XXX: These are preliminary thread counts and URB sizes. */
+   .max_vs_threads = 56,
+   .max_hs_threads = 56,
+   .max_ds_threads = 56,
+   .max_gs_threads = 56,
+   .max_wm_threads = 32,
+   .max_cs_threads = 28,
+   .urb = {
+      .size = 64,
+      .min_vs_entries = 34,
+      .max_vs_entries = 640,
+      .max_hs_entries = 80,
+      .max_ds_entries = 80,
+      .max_gs_entries = 256,
+   }
 };
 
 const struct brw_device_info *
@@ -348,9 +371,6 @@ brw_get_device_info(int devid, int revision)
       return NULL;
    }
 
-   if (devinfo->gen == 9 && (revision == 2 || revision == 3 || revision == -1))
-      return &brw_device_info_skl_early;
-
    return devinfo;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h
index 9192235fb0e..7bab5716b43 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.h
+++ b/src/mesa/drivers/dri/i965/brw_device_info.h
@@ -35,6 +35,7 @@ struct brw_device_info
    bool is_baytrail;
    bool is_haswell;
    bool is_cherryview;
+   bool is_broxton;
 
    bool has_hiz_and_separate_stencil;
    bool must_use_separate_stencil;
@@ -45,6 +46,7 @@ struct brw_device_info
    bool has_compr4;
    bool has_surface_tile_offset;
    bool supports_simd16_3src;
+   bool has_resource_streamer;
 
    /**
     * Quirks:
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index b91597a9f5d..e092ef4a7c6 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -104,13 +104,13 @@ get_hw_prim_for_gl_prim(int mode)
  * programs be immune to the active primitive (ie. cope with all
  * possibilities).  That may not be realistic however.
  */
-static void brw_set_prim(struct brw_context *brw,
-                         const struct _mesa_prim *prim)
+static void
+brw_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
 {
    struct gl_context *ctx = &brw->ctx;
    uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);
 
-   DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode));
+   DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));
 
    /* Slight optimization to avoid the GS program when not needed:
     */
@@ -138,15 +138,12 @@ static void brw_set_prim(struct brw_context *brw,
    }
 }
 
-static void gen6_set_prim(struct brw_context *brw,
-                          const struct _mesa_prim *prim)
+static void
+gen6_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
 {
-   uint32_t hw_prim;
-
-   DBG("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim->mode));
-
-   hw_prim = get_hw_prim_for_gl_prim(prim->mode);
+   DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));
 
+   const uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);
    if (hw_prim != brw->primitive) {
       brw->primitive = hw_prim;
       brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE;
@@ -162,7 +159,8 @@ static void gen6_set_prim(struct brw_context *brw,
  * quads so that those dangling vertices won't get drawn when we convert to
  * trifans/tristrips.
  */
-static GLuint trim(GLenum prim, GLuint length)
+static GLuint
+trim(GLenum prim, GLuint length)
 {
    if (prim == GL_QUAD_STRIP)
       return length > 3 ? (length - length % 2) : 0;
@@ -173,16 +171,16 @@ static GLuint trim(GLenum prim, GLuint length)
 }
 
 
-static void brw_emit_prim(struct brw_context *brw,
-			  const struct _mesa_prim *prim,
-			  uint32_t hw_prim)
+static void
+brw_emit_prim(struct brw_context *brw,
+              const struct _mesa_prim *prim,
+              uint32_t hw_prim)
 {
    int verts_per_instance;
    int vertex_access_type;
    int indirect_flag;
-   int predicate_enable;
 
-   DBG("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode),
+   DBG("PRIM: %s %d %d\n", _mesa_enum_to_string(prim->mode),
        prim->start, prim->count);
 
    int start_vertex_location = prim->start;
@@ -216,9 +214,8 @@ static void brw_emit_prim(struct brw_context *brw,
     * and missed flushes of the render cache as it heads to other parts of
     * the besides the draw code.
     */
-   if (brw->always_flush_cache) {
-      intel_batchbuffer_emit_mi_flush(brw);
-   }
+   if (brw->always_flush_cache)
+      brw_emit_mi_flush(brw);
 
    /* If indirect, emit a bunch of loads from the indirect BO. */
    if (prim->is_indirect) {
@@ -256,22 +253,20 @@ static void brw_emit_prim(struct brw_context *brw,
          OUT_BATCH(0);
          ADVANCE_BATCH();
       }
-   }
-   else {
+   } else {
       indirect_flag = 0;
    }
 
-   if (brw->gen >= 7) {
-      if (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
-         predicate_enable = GEN7_3DPRIM_PREDICATE_ENABLE;
-      else
-         predicate_enable = 0;
+   BEGIN_BATCH(brw->gen >= 7 ? 7 : 6);
+
+   if (brw->gen >= 7) {
+      const int predicate_enable =
+         (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
+         ? GEN7_3DPRIM_PREDICATE_ENABLE : 0;
 
-      BEGIN_BATCH(7);
       OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable);
       OUT_BATCH(hw_prim | vertex_access_type);
    } else {
-      BEGIN_BATCH(6);
       OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) |
                 hw_prim << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT |
                 vertex_access_type);
@@ -283,14 +278,14 @@ static void brw_emit_prim(struct brw_context *brw,
    OUT_BATCH(base_vertex_location);
    ADVANCE_BATCH();
 
-   if (brw->always_flush_cache) {
-      intel_batchbuffer_emit_mi_flush(brw);
-   }
+   if (brw->always_flush_cache)
+      brw_emit_mi_flush(brw);
 }
 
 
-static void brw_merge_inputs( struct brw_context *brw,
-		       const struct gl_client_array *arrays[])
+static void
+brw_merge_inputs(struct brw_context *brw,
+                 const struct gl_client_array *arrays[])
 {
    const struct gl_context *ctx = &brw->ctx;
    GLuint i;
@@ -359,7 +354,8 @@ static void brw_merge_inputs( struct brw_context *brw,
  * Also mark any render targets which will be textured as needing a render
  * cache flush.
  */
-static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
+static void
+brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
@@ -399,21 +395,22 @@ static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
 /* May fail if out of video memory for texture or vbo upload, or on
  * fallback conditions.
  */
-static void brw_try_draw_prims( struct gl_context *ctx,
-				     const struct gl_client_array *arrays[],
-				     const struct _mesa_prim *prims,
-				     GLuint nr_prims,
-				     const struct _mesa_index_buffer *ib,
-				     GLuint min_index,
-				     GLuint max_index,
-				     struct gl_buffer_object *indirect)
+static void
+brw_try_draw_prims(struct gl_context *ctx,
+                   const struct gl_client_array *arrays[],
+                   const struct _mesa_prim *prims,
+                   GLuint nr_prims,
+                   const struct _mesa_index_buffer *ib,
+                   GLuint min_index,
+                   GLuint max_index,
+                   struct gl_buffer_object *indirect)
 {
    struct brw_context *brw = brw_context(ctx);
    GLuint i;
    bool fail_next = false;
 
    if (ctx->NewState)
-      _mesa_update_state( ctx );
+      _mesa_update_state(ctx);
 
    /* Find the highest sampler unit used by each shader program.  A bit-count
     * won't work since ARB programs use the texture unit number as the sampler
@@ -433,7 +430,7 @@ static void brw_try_draw_prims( struct gl_context *ctx,
     * software fallback will segfault if it attempts to access any
     * texture level other than level 0.
     */
-   brw_validate_textures( brw );
+   brw_validate_textures(brw);
 
    intel_prepare_render(brw);
 
@@ -445,7 +442,7 @@ static void brw_try_draw_prims( struct gl_context *ctx,
 
    /* Bind all inputs, derive varying and size information:
     */
-   brw_merge_inputs( brw, arrays );
+   brw_merge_inputs(brw, arrays);
 
    brw->ib.ib = ib;
    brw->ctx.NewDriverState |= BRW_NEW_INDICES;
@@ -553,15 +550,17 @@ retry:
    return;
 }
 
-void brw_draw_prims( struct gl_context *ctx,
-		     const struct _mesa_prim *prims,
-		     GLuint nr_prims,
-		     const struct _mesa_index_buffer *ib,
-		     GLboolean index_bounds_valid,
-		     GLuint min_index,
-		     GLuint max_index,
-		     struct gl_transform_feedback_object *unused_tfb_object,
-		     struct gl_buffer_object *indirect )
+void
+brw_draw_prims(struct gl_context *ctx,
+               const struct _mesa_prim *prims,
+               GLuint nr_prims,
+               const struct _mesa_index_buffer *ib,
+               GLboolean index_bounds_valid,
+               GLuint min_index,
+               GLuint max_index,
+               struct gl_transform_feedback_object *unused_tfb_object,
+               unsigned stream,
+               struct gl_buffer_object *indirect)
 {
    struct brw_context *brw = brw_context(ctx);
    const struct gl_client_array **arrays = ctx->Array._DrawArrays;
@@ -582,11 +581,11 @@ void brw_draw_prims( struct gl_context *ctx,
     */
    if (ctx->RenderMode != GL_RENDER) {
       perf_debug("%s render mode not supported in hardware\n",
-                 _mesa_lookup_enum_by_nr(ctx->RenderMode));
+                 _mesa_enum_to_string(ctx->RenderMode));
       _swsetup_Wakeup(ctx);
       _tnl_wakeup(ctx);
       _tnl_draw_prims(ctx, prims, nr_prims, ib,
-                      index_bounds_valid, min_index, max_index, NULL, NULL);
+                      index_bounds_valid, min_index, max_index, NULL, 0, NULL);
       return;
    }
 
@@ -604,26 +603,28 @@ void brw_draw_prims( struct gl_context *ctx,
     * manage it.  swrast doesn't support our featureset, so we can't fall back
     * to it.
     */
-   brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index, indirect);
+   brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index,
+                      indirect);
 }
 
-void brw_draw_init( struct brw_context *brw )
+void
+brw_draw_init(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    struct vbo_context *vbo = vbo_context(ctx);
-   int i;
 
    /* Register our drawing function:
     */
    vbo->draw_prims = brw_draw_prims;
 
-   for (i = 0; i < VERT_ATTRIB_MAX; i++)
+   for (int i = 0; i < VERT_ATTRIB_MAX; i++)
       brw->vb.inputs[i].buffer = -1;
    brw->vb.nr_buffers = 0;
    brw->vb.nr_enabled = 0;
 }
 
-void brw_draw_destroy( struct brw_context *brw )
+void
+brw_draw_destroy(struct brw_context *brw)
 {
    int i;
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h
index fc83dcdd0bb..f994726f5b6 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.h
+++ b/src/mesa/drivers/dri/i965/brw_draw.h
@@ -34,7 +34,7 @@
 struct brw_context;
 
 
-void brw_draw_prims( struct gl_context *ctx,
+void brw_draw_prims(struct gl_context *ctx,
 		     const struct _mesa_prim *prims,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
@@ -42,6 +42,7 @@ void brw_draw_prims( struct gl_context *ctx,
 		     GLuint min_index,
 		     GLuint max_index,
 		     struct gl_transform_feedback_object *unused_tfb_object,
+                     unsigned stream,
 		     struct gl_buffer_object *indirect );
 
 void brw_draw_init( struct brw_context *brw );
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 320e40e1007..cbfd5855410 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -40,7 +40,7 @@
 #include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
-static GLuint double_types[5] = {
+static const GLuint double_types[5] = {
    0,
    BRW_SURFACEFORMAT_R64_FLOAT,
    BRW_SURFACEFORMAT_R64G64_FLOAT,
@@ -48,7 +48,7 @@ static GLuint double_types[5] = {
    BRW_SURFACEFORMAT_R64G64B64A64_FLOAT
 };
 
-static GLuint float_types[5] = {
+static const GLuint float_types[5] = {
    0,
    BRW_SURFACEFORMAT_R32_FLOAT,
    BRW_SURFACEFORMAT_R32G32_FLOAT,
@@ -56,7 +56,7 @@ static GLuint float_types[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_FLOAT
 };
 
-static GLuint half_float_types[5] = {
+static const GLuint half_float_types[5] = {
    0,
    BRW_SURFACEFORMAT_R16_FLOAT,
    BRW_SURFACEFORMAT_R16G16_FLOAT,
@@ -64,7 +64,7 @@ static GLuint half_float_types[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_FLOAT
 };
 
-static GLuint fixed_point_types[5] = {
+static const GLuint fixed_point_types[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SFIXED,
    BRW_SURFACEFORMAT_R32G32_SFIXED,
@@ -72,7 +72,7 @@ static GLuint fixed_point_types[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SFIXED,
 };
 
-static GLuint uint_types_direct[5] = {
+static const GLuint uint_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R32_UINT,
    BRW_SURFACEFORMAT_R32G32_UINT,
@@ -80,7 +80,7 @@ static GLuint uint_types_direct[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_UINT
 };
 
-static GLuint uint_types_norm[5] = {
+static const GLuint uint_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R32_UNORM,
    BRW_SURFACEFORMAT_R32G32_UNORM,
@@ -88,7 +88,7 @@ static GLuint uint_types_norm[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_UNORM
 };
 
-static GLuint uint_types_scale[5] = {
+static const GLuint uint_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R32_USCALED,
    BRW_SURFACEFORMAT_R32G32_USCALED,
@@ -96,7 +96,7 @@ static GLuint uint_types_scale[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_USCALED
 };
 
-static GLuint int_types_direct[5] = {
+static const GLuint int_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SINT,
    BRW_SURFACEFORMAT_R32G32_SINT,
@@ -104,7 +104,7 @@ static GLuint int_types_direct[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SINT
 };
 
-static GLuint int_types_norm[5] = {
+static const GLuint int_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SNORM,
    BRW_SURFACEFORMAT_R32G32_SNORM,
@@ -112,7 +112,7 @@ static GLuint int_types_norm[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SNORM
 };
 
-static GLuint int_types_scale[5] = {
+static const GLuint int_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R32_SSCALED,
    BRW_SURFACEFORMAT_R32G32_SSCALED,
@@ -120,7 +120,7 @@ static GLuint int_types_scale[5] = {
    BRW_SURFACEFORMAT_R32G32B32A32_SSCALED
 };
 
-static GLuint ushort_types_direct[5] = {
+static const GLuint ushort_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R16_UINT,
    BRW_SURFACEFORMAT_R16G16_UINT,
@@ -128,7 +128,7 @@ static GLuint ushort_types_direct[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_UINT
 };
 
-static GLuint ushort_types_norm[5] = {
+static const GLuint ushort_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R16_UNORM,
    BRW_SURFACEFORMAT_R16G16_UNORM,
@@ -136,7 +136,7 @@ static GLuint ushort_types_norm[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_UNORM
 };
 
-static GLuint ushort_types_scale[5] = {
+static const GLuint ushort_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R16_USCALED,
    BRW_SURFACEFORMAT_R16G16_USCALED,
@@ -144,7 +144,7 @@ static GLuint ushort_types_scale[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_USCALED
 };
 
-static GLuint short_types_direct[5] = {
+static const GLuint short_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R16_SINT,
    BRW_SURFACEFORMAT_R16G16_SINT,
@@ -152,7 +152,7 @@ static GLuint short_types_direct[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_SINT
 };
 
-static GLuint short_types_norm[5] = {
+static const GLuint short_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R16_SNORM,
    BRW_SURFACEFORMAT_R16G16_SNORM,
@@ -160,7 +160,7 @@ static GLuint short_types_norm[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_SNORM
 };
 
-static GLuint short_types_scale[5] = {
+static const GLuint short_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R16_SSCALED,
    BRW_SURFACEFORMAT_R16G16_SSCALED,
@@ -168,7 +168,7 @@ static GLuint short_types_scale[5] = {
    BRW_SURFACEFORMAT_R16G16B16A16_SSCALED
 };
 
-static GLuint ubyte_types_direct[5] = {
+static const GLuint ubyte_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R8_UINT,
    BRW_SURFACEFORMAT_R8G8_UINT,
@@ -176,7 +176,7 @@ static GLuint ubyte_types_direct[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_UINT
 };
 
-static GLuint ubyte_types_norm[5] = {
+static const GLuint ubyte_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R8_UNORM,
    BRW_SURFACEFORMAT_R8G8_UNORM,
@@ -184,7 +184,7 @@ static GLuint ubyte_types_norm[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_UNORM
 };
 
-static GLuint ubyte_types_scale[5] = {
+static const GLuint ubyte_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R8_USCALED,
    BRW_SURFACEFORMAT_R8G8_USCALED,
@@ -192,7 +192,7 @@ static GLuint ubyte_types_scale[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_USCALED
 };
 
-static GLuint byte_types_direct[5] = {
+static const GLuint byte_types_direct[5] = {
    0,
    BRW_SURFACEFORMAT_R8_SINT,
    BRW_SURFACEFORMAT_R8G8_SINT,
@@ -200,7 +200,7 @@ static GLuint byte_types_direct[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_SINT
 };
 
-static GLuint byte_types_norm[5] = {
+static const GLuint byte_types_norm[5] = {
    0,
    BRW_SURFACEFORMAT_R8_SNORM,
    BRW_SURFACEFORMAT_R8G8_SNORM,
@@ -208,7 +208,7 @@ static GLuint byte_types_norm[5] = {
    BRW_SURFACEFORMAT_R8G8B8A8_SNORM
 };
 
-static GLuint byte_types_scale[5] = {
+static const GLuint byte_types_scale[5] = {
    0,
    BRW_SURFACEFORMAT_R8_SSCALED,
    BRW_SURFACEFORMAT_R8G8_SSCALED,
@@ -230,7 +230,7 @@ brw_get_vertex_surface_type(struct brw_context *brw,
 
    if (unlikely(INTEL_DEBUG & DEBUG_VERTS))
       fprintf(stderr, "type %s size %d normalized %d\n",
-              _mesa_lookup_enum_by_nr(glarray->Type),
+              _mesa_enum_to_string(glarray->Type),
               glarray->Size, glarray->Normalized);
 
    if (glarray->Integer) {
@@ -604,14 +604,15 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
 /**
  * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
  */
-static void
+static uint32_t *
 emit_vertex_buffer_state(struct brw_context *brw,
                          unsigned buffer_nr,
                          drm_intel_bo *bo,
                          unsigned bo_ending_address,
                          unsigned bo_offset,
                          unsigned stride,
-                         unsigned step_rate)
+                         unsigned step_rate,
+                         uint32_t *__map)
 {
    struct gl_context *ctx = &brw->ctx;
    uint32_t dw0;
@@ -643,9 +644,13 @@ emit_vertex_buffer_state(struct brw_context *brw,
       OUT_BATCH(0);
    }
    OUT_BATCH(step_rate);
-}
 
-static void brw_emit_vertices(struct brw_context *brw)
+   return __map;
+}
+#define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map)
+
+static void
+brw_emit_vertices(struct brw_context *brw)
 {
    GLuint i;
 
@@ -704,14 +709,14 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
       for (i = 0; i < brw->vb.nr_buffers; i++) {
 	 struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
-         emit_vertex_buffer_state(brw, i, buffer->bo, buffer->bo->size - 1,
+         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1,
                                   buffer->offset, buffer->stride,
                                   buffer->step_rate);
 
       }
 
       if (brw->vs.prog_data->uses_vertexid) {
-         emit_vertex_buffer_state(brw, brw->vb.nr_buffers,
+         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
                                   brw->draw.draw_params_bo,
                                   brw->draw.draw_params_bo->size - 1,
                                   brw->draw.draw_params_offset,
@@ -855,7 +860,8 @@ const struct brw_tracked_state brw_vertices = {
    .emit = brw_emit_vertices,
 };
 
-static void brw_upload_indices(struct brw_context *brw)
+static void
+brw_upload_indices(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
@@ -935,7 +941,8 @@ const struct brw_tracked_state brw_indices = {
    .emit = brw_upload_indices,
 };
 
-static void brw_emit_index_buffer(struct brw_context *brw)
+static void
+brw_emit_index_buffer(struct brw_context *brw)
 {
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
    GLuint cut_index_setting;
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 0f536046f6f..4d397622fc1 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1584,8 +1584,8 @@ brw_ENDIF(struct brw_codegen *p)
    }
 
    if (devinfo->gen < 6) {
-      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, brw_imm_d(0x0));
    } else if (devinfo->gen == 6) {
       brw_set_dest(p, insn, brw_imm_w(0));
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8984b4cb3ca..0e091ddc227 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -68,28 +68,6 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 
    assert(dst.file != IMM && dst.file != UNIFORM);
 
-   /* If exec_size == 0, try to guess it from the registers.  Since all
-    * manner of things may use hardware registers, we first try to guess
-    * based on GRF registers.  If this fails, we will go ahead and take the
-    * width from the destination register.
-    */
-   if (this->exec_size == 0) {
-      if (dst.file == GRF) {
-         this->exec_size = dst.width;
-      } else {
-         for (unsigned i = 0; i < sources; ++i) {
-            if (src[i].file != GRF && src[i].file != ATTR)
-               continue;
-
-            if (this->exec_size <= 1)
-               this->exec_size = src[i].width;
-            assert(src[i].width == 1 || src[i].width == this->exec_size);
-         }
-      }
-
-      if (this->exec_size == 0 && dst.file != BAD_FILE)
-         this->exec_size = dst.width;
-   }
    assert(this->exec_size != 0);
 
    this->conditional_mod = BRW_CONDITIONAL_NONE;
@@ -100,8 +78,8 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case HW_REG:
    case MRF:
    case ATTR:
-      this->regs_written =
-         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
+      this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
+                                        REG_SIZE);
       break;
    case BAD_FILE:
       this->regs_written = 0;
@@ -126,9 +104,9 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
    init(opcode, exec_size, reg_undef, NULL, 0);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
 {
-   init(opcode, 0, dst, NULL, 0);
+   init(opcode, exec_size, dst, NULL, 0);
 }
 
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
@@ -138,12 +116,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    init(opcode, exec_size, dst, src, 1);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
-   const fs_reg src[1] = { src0 };
-   init(opcode, 0, dst, src, 1);
-}
-
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                  const fs_reg &src0, const fs_reg &src1)
 {
@@ -151,13 +123,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    init(opcode, exec_size, dst, src, 2);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1)
-{
-   const fs_reg src[2] = { src0, src1 };
-   init(opcode, 0, dst, src, 2);
-}
-
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 {
@@ -165,19 +130,6 @@ fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    init(opcode, exec_size, dst, src, 3);
 }
 
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1, const fs_reg &src2)
-{
-   const fs_reg src[3] = { src0, src1, src2 };
-   init(opcode, 0, dst, src, 3);
-}
-
-fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
-                 const fs_reg src[], unsigned sources)
-{
-   init(opcode, 0, dst, src, sources);
-}
-
 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                  const fs_reg src[], unsigned sources)
 {
@@ -236,7 +188,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 
    int scale = 1;
-   if (devinfo->gen == 4 && dst.width == 8) {
+   if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
        * u, v, r) as parameters, or we can just use the SIMD16 message
        * consisting of (header, u).  We choose the second, at the cost of a
@@ -251,10 +203,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    else
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 
-   assert(dst.width % 8 == 0);
-   int regs_written = 4 * (dst.width / 8) * scale;
-   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
-                               dst.type, dst.width);
+   int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
+   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
 
@@ -264,10 +214,10 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
       if (devinfo->gen == 4)
          inst->mlen = 3;
       else
-         inst->mlen = 1 + dispatch_width / 8;
+         inst->mlen = 1 + bld.dispatch_width() / 8;
    }
 
-   bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
+   bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
 }
 
 /**
@@ -358,10 +308,14 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 
    for (int i = 0; i < this->sources; i++) {
       reg.type = this->src[i].type;
-      reg.width = this->src[i].width;
       if (!this->src[i].equals(reg))
          return false;
-      reg = ::offset(reg, 1);
+
+      if (i < this->header_size) {
+         reg.reg_offset += 1;
+      } else {
+         reg.reg_offset += this->exec_size / 8;
+      }
    }
 
    return true;
@@ -408,8 +362,8 @@ fs_reg::fs_reg(float f)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_F;
+   this->stride = 0;
    this->fixed_hw_reg.dw1.f = f;
-   this->width = 1;
 }
 
 /** Immediate value constructor. */
@@ -418,8 +372,8 @@ fs_reg::fs_reg(int32_t i)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_D;
+   this->stride = 0;
    this->fixed_hw_reg.dw1.d = i;
-   this->width = 1;
 }
 
 /** Immediate value constructor. */
@@ -428,8 +382,8 @@ fs_reg::fs_reg(uint32_t u)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_UD;
+   this->stride = 0;
    this->fixed_hw_reg.dw1.ud = u;
-   this->width = 1;
 }
 
 /** Vector float immediate value constructor. */
@@ -460,7 +414,6 @@ fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
    this->file = HW_REG;
    this->fixed_hw_reg = fixed_hw_reg;
    this->type = fixed_hw_reg.type;
-   this->width = 1 << fixed_hw_reg.width;
 }
 
 bool
@@ -475,7 +428,6 @@ fs_reg::equals(const fs_reg &r) const
            abs == r.abs &&
            !reladdr && !r.reladdr &&
            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
-           width == r.width &&
            stride == r.stride);
 }
 
@@ -494,6 +446,15 @@ fs_reg::is_contiguous() const
    return stride == 1;
 }
 
+unsigned
+fs_reg::component_size(unsigned width) const
+{
+   const unsigned stride = (file != HW_REG ? this->stride :
+                            fixed_hw_reg.hstride == 0 ? 0 :
+                            1 << (fixed_hw_reg.hstride - 1));
+   return MAX2(width * stride, 1) * type_sz(type);
+}
+
 int
 fs_visitor::type_size(const struct glsl_type *type)
 {
@@ -520,7 +481,10 @@ fs_visitor::type_size(const struct glsl_type *type)
       return 0;
    case GLSL_TYPE_ATOMIC_UINT:
       return 0;
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
    case GLSL_TYPE_IMAGE:
+      return BRW_IMAGE_PARAM_SIZE;
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
@@ -548,12 +512,12 @@ fs_visitor::get_timestamp(const fs_builder &bld)
                                           0),
                              BRW_REGISTER_TYPE_UD));
 
-   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
+   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
     */
-   bld.exec_all().MOV(dst, ts);
+   bld.group(4, 0).exec_all().MOV(dst, ts);
 
    /* The caller wants the low 32 bits of the timestamp.  Since it's running
     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
@@ -598,19 +562,21 @@ fs_visitor::emit_shader_time_end()
 
    fs_reg start = shader_start_time;
    start.negate = true;
-   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
+   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    diff.set_smear(0);
-   ibld.ADD(diff, start, shader_end_time);
+
+   const fs_builder cbld = ibld.group(1, 0);
+   cbld.group(1, 0).ADD(diff, start, shader_end_time);
 
    /* If there were no instructions between the two timestamp gets, the diff
     * is 2 cycles.  Remove that overhead, so I can forget about that when
     * trying to determine the time taken for single instructions.
     */
-   ibld.ADD(diff, diff, fs_reg(-2u));
-   SHADER_TIME_ADD(ibld, 0, diff);
-   SHADER_TIME_ADD(ibld, 1, fs_reg(1u));
+   cbld.ADD(diff, diff, fs_reg(-2u));
+   SHADER_TIME_ADD(cbld, 0, diff);
+   SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
    ibld.emit(BRW_OPCODE_ELSE);
-   SHADER_TIME_ADD(ibld, 2, fs_reg(1u));
+   SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
    ibld.emit(BRW_OPCODE_ENDIF);
 }
 
@@ -695,50 +661,160 @@ bool
 fs_inst::is_partial_write() const
 {
    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
-           (this->dst.width * type_sz(this->dst.type)) < 32 ||
+           (this->exec_size * type_sz(this->dst.type)) < 32 ||
            !this->dst.is_contiguous());
 }
 
+unsigned
+fs_inst::components_read(unsigned i) const
+{
+   switch (opcode) {
+   case FS_OPCODE_LINTERP:
+      if (i == 0)
+         return 2;
+      else
+         return 1;
+
+   case FS_OPCODE_PIXEL_X:
+   case FS_OPCODE_PIXEL_Y:
+      assert(i == 0);
+      return 2;
+
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      assert(src[6].file == IMM);
+      /* First/second FB write color. */
+      if (i < 2)
+         return src[6].fixed_hw_reg.dw1.ud;
+      else
+         return 1;
+
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      assert(src[8].file == IMM && src[9].file == IMM);
+      /* Texture coordinates. */
+      if (i == 0)
+         return src[8].fixed_hw_reg.dw1.ud;
+      /* Texture derivatives. */
+      else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
+         return src[9].fixed_hw_reg.dw1.ud;
+      /* Texture offset. */
+      else if (i == 7)
+         return 2;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      assert(src[3].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source (ignored for reads). */
+      else if (i == 1)
+         return 0;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1)
+         return src[4].fixed_hw_reg.dw1.ud;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1 && op == BRW_AOP_CMPWR)
+         return 2;
+      else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+                          op == BRW_AOP_PREDEC))
+         return 0;
+      else
+         return 1;
+   }
+
+   default:
+      return 1;
+   }
+}
+
 int
 fs_inst::regs_read(int arg) const
 {
-   if (is_tex() && arg == 0 && src[0].file == GRF) {
-      return mlen;
-   } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
-      return mlen;
-   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
-      return mlen;
-   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
-      return mlen;
-   } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
-      return exec_size / 4;
+   switch (opcode) {
+   case FS_OPCODE_FB_WRITE:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+      if (arg == 0)
+         return mlen;
+      break;
+
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+      /* The payload is actually stored in src1 */
+      if (arg == 1)
+         return mlen;
+      break;
+
+   case FS_OPCODE_LINTERP:
+      if (arg == 1)
+         return 1;
+      break;
+
+   case SHADER_OPCODE_LOAD_PAYLOAD:
+      if (arg < this->header_size)
+         return 1;
+      break;
+
+   case CS_OPCODE_CS_TERMINATE:
+      return 1;
+
+   default:
+      if (is_tex() && arg == 0 && src[0].file == GRF)
+         return mlen;
+      break;
    }
 
    switch (src[arg].file) {
    case BAD_FILE:
+      return 0;
    case UNIFORM:
    case IMM:
       return 1;
    case GRF:
+   case ATTR:
    case HW_REG:
-      if (src[arg].stride == 0) {
-         return 1;
-      } else {
-         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
-         return (size + 31) / 32;
-      }
+      return DIV_ROUND_UP(components_read(arg) *
+                          src[arg].component_size(exec_size),
+                          REG_SIZE);
    case MRF:
       unreachable("MRF registers are not allowed as sources");
    default:
@@ -832,7 +908,7 @@ fs_visitor::vgrf(const glsl_type *const type)
 {
    int reg_width = dispatch_width / 8;
    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
-                 brw_type_for_base_type(type), dispatch_width);
+                 brw_type_for_base_type(type));
 }
 
 /** Fixed HW reg constructor. */
@@ -842,14 +918,7 @@ fs_reg::fs_reg(enum register_file file, int reg)
    this->file = file;
    this->reg = reg;
    this->type = BRW_REGISTER_TYPE_F;
-
-   switch (file) {
-   case UNIFORM:
-      this->width = 1;
-      break;
-   default:
-      this->width = 8;
-   }
+   this->stride = (file == UNIFORM ? 0 : 1);
 }
 
 /** Fixed HW reg constructor. */
@@ -859,25 +928,7 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
    this->file = file;
    this->reg = reg;
    this->type = type;
-
-   switch (file) {
-   case UNIFORM:
-      this->width = 1;
-      break;
-   default:
-      this->width = 8;
-   }
-}
-
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
-               uint8_t width)
-{
-   init();
-   this->file = file;
-   this->reg = reg;
-   this->type = type;
-   this->width = width;
+   this->stride = (file == UNIFORM ? 0 : 1);
 }
 
 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
@@ -892,6 +943,18 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->param_size = v->param_size;
 }
 
+void
+fs_visitor::setup_vector_uniform_values(const gl_constant_value *values, unsigned n)
+{
+   static const gl_constant_value zero = { 0 };
+
+   for (unsigned i = 0; i < n; ++i)
+      stage_prog_data->param[uniforms++] = &values[i];
+
+   for (unsigned i = n; i < 4; ++i)
+      stage_prog_data->param[uniforms++] = &zero;
+}
+
 fs_reg *
 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                          bool origin_upper_left)
@@ -908,23 +971,23 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
    } else {
       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
    }
-   wpos = offset(wpos, 1);
+   wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.y */
    if (!flip && pixel_center_integer) {
       bld.MOV(wpos, this->pixel_y);
    } else {
       fs_reg pixel_y = this->pixel_y;
-      float offset = (pixel_center_integer ? 0.0 : 0.5);
+      float offset = (pixel_center_integer ? 0.0f : 0.5f);
 
       if (flip) {
 	 pixel_y.negate = true;
-	 offset += key->drawable_height - 1.0;
+	 offset += key->drawable_height - 1.0f;
       }
 
       bld.ADD(wpos, pixel_y, fs_reg(offset));
    }
-   wpos = offset(wpos, 1);
+   wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.z */
    if (devinfo->gen >= 6) {
@@ -934,7 +997,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            interp_reg(VARYING_SLOT_POS, 2));
    }
-   wpos = offset(wpos, 1);
+   wpos = offset(wpos, bld, 1);
 
    /* gl_FragCoord.w: Already set up in emit_interpolation */
    bld.MOV(wpos, this->wpos_w);
@@ -1017,7 +1080,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 	    /* If there's no incoming setup data for this slot, don't
 	     * emit interpolation for it.
 	     */
-	    attr = offset(attr, type->vector_elements);
+	    attr = offset(attr, bld, type->vector_elements);
 	    location++;
 	    continue;
 	 }
@@ -1032,7 +1095,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 	       interp = suboffset(interp, 3);
                interp.type = attr.type;
                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
-	       attr = offset(attr, 1);
+	       attr = offset(attr, bld, 1);
 	    }
 	 } else {
 	    /* Smooth/noperspective interpolation case. */
@@ -1070,7 +1133,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                   bld.MUL(attr, attr, this->pixel_w);
                }
-	       attr = offset(attr, 1);
+	       attr = offset(attr, bld, 1);
 	    }
 
 	 }
@@ -1178,7 +1241,7 @@ fs_visitor::emit_samplepos_setup()
    }
    /* Compute gl_SamplePosition.x */
    compute_sample_position(pos, int_sample_x);
-   pos = offset(pos, 1);
+   pos = offset(pos, abld, 1);
    if (dispatch_width == 8) {
       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
    } else {
@@ -1250,15 +1313,16 @@ fs_visitor::emit_sampleid_setup()
    return reg;
 }
 
-void
-fs_visitor::resolve_source_modifiers(fs_reg *src)
+fs_reg
+fs_visitor::resolve_source_modifiers(const fs_reg &src)
 {
-   if (!src->abs && !src->negate)
-      return;
+   if (!src.abs && !src.negate)
+      return src;
 
-   fs_reg temp = bld.vgrf(src->type);
-   bld.MOV(temp, *src);
-   *src = temp;
+   fs_reg temp = bld.vgrf(src.type);
+   bld.MOV(temp, src);
+
+   return temp;
 }
 
 void
@@ -1318,6 +1382,7 @@ fs_visitor::assign_curb_setup()
 						  constant_nr / 8,
 						  constant_nr % 8);
 
+            assert(inst->src[i].stride == 0);
 	    inst->src[i].file = HW_REG;
 	    inst->src[i].fixed_hw_reg = byte_offset(
                retype(brw_reg, inst->src[i].type),
@@ -1867,11 +1932,12 @@ fs_visitor::demote_pull_constants()
 	    continue;
 
          /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
-                                    .at(block, inst);
+         const fs_builder ibld(this, block, inst);
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
          fs_reg dst = vgrf(glsl_type::float_type);
 
+         assert(inst->src[i].stride == 0);
+
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
@@ -1879,9 +1945,11 @@ fs_visitor::demote_pull_constants()
                                        *inst->src[i].reladdr,
                                        pull_index);
             inst->src[i].reladdr = NULL;
+            inst->src[i].stride = 1;
          } else {
+            const fs_builder ubld = ibld.exec_all().group(8, 0);
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                       dst, surf_index, offset);
             inst->src[i].set_smear(pull_index & 3);
          }
@@ -1890,7 +1958,6 @@ fs_visitor::demote_pull_constants()
          inst->src[i].file = GRF;
          inst->src[i].reg = dst.reg;
          inst->src[i].reg_offset = 0;
-         inst->src[i].width = dispatch_width;
       }
    }
    invalidate_live_intervals();
@@ -2158,11 +2225,11 @@ fs_visitor::opt_zero_samples()
        *     "Parameter 0 is required except for the sampleinfo message, which
        *      has no parameter 0"
        */
-      while (inst->mlen > inst->header_size + dispatch_width / 8 &&
+      while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
              load_payload->src[(inst->mlen - inst->header_size) /
-                               (dispatch_width / 8) +
+                               (inst->exec_size / 8) +
                                inst->header_size - 1].is_zero()) {
-         inst->mlen -= dispatch_width / 8;
+         inst->mlen -= inst->exec_size / 8;
          progress = true;
       }
    }
@@ -2199,7 +2266,8 @@ fs_visitor::opt_sampler_eot()
       return false;
 
    /* Look for a texturing instruction immediately before the final FB_WRITE. */
-   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+   fs_inst *fb_write = (fs_inst *)block->end();
    assert(fb_write->eot);
    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
 
@@ -2230,9 +2298,11 @@ fs_visitor::opt_sampler_eot()
    assert(!tex_inst->eot); /* We can't get here twice */
    assert((tex_inst->offset & (0xff << 24)) == 0);
 
+   const fs_builder ibld(this, block, tex_inst);
+
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
-   tex_inst->dst = bld.null_reg_ud();
+   tex_inst->dst = ibld.null_reg_ud();
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2244,8 +2314,8 @@ fs_visitor::opt_sampler_eot()
    if (tex_inst->header_size != 0)
       return true;
 
-   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
-                                 load_payload->sources + 1);
+   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+                                  load_payload->sources + 1);
    fs_reg *new_sources =
       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
 
@@ -2307,12 +2377,12 @@ fs_visitor::opt_register_renaming()
 
       if (depth == 0 &&
           inst->dst.file == GRF &&
-          alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
+          alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
          } else {
-            remap[dst] = alloc.allocate(inst->dst.width / 8);
+            remap[dst] = alloc.allocate(inst->exec_size / 8);
             inst->dst.reg = remap[dst];
             progress = true;
          }
@@ -2443,7 +2513,7 @@ fs_visitor::compute_to_mrf()
             /* Things returning more than one register would need us to
              * understand coalescing out more than one MOV at a time.
              */
-            if (scan_inst->regs_written > scan_inst->dst.width / 8)
+            if (scan_inst->regs_written > scan_inst->exec_size / 8)
                break;
 
 	    /* SEND instructions can't have MRF as a destination. */
@@ -2780,7 +2850,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
       if (block->start() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2796,7 +2867,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
                 needs_dep[reg - first_write_grf]) {
-               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
                needs_dep[reg - first_write_grf] = false;
                if (scan_inst->exec_size == 16)
                   needs_dep[reg - first_write_grf + 1] = false;
@@ -2843,7 +2914,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2858,7 +2930,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
           scan_inst->dst.reg >= first_write_grf &&
           scan_inst->dst.reg < first_write_grf + write_len &&
           needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
+         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                         scan_inst->dst.reg);
          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
       }
 
@@ -2928,14 +3001,18 @@ fs_visitor::lower_uniform_pull_constant_loads()
          assert(const_offset_reg.file == IMM &&
                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
-         fs_reg payload = fs_reg(GRF, alloc.allocate(1));
 
-         /* We have to use a message header on Skylake to get SIMD4x2 mode.
-          * Reserve space for the register.
-          */
+         fs_reg payload, offset;
          if (devinfo->gen >= 9) {
-            payload.reg_offset++;
-            alloc.sizes[payload.reg] = 2;
+            /* We have to use a message header on Skylake to get SIMD4x2
+             * mode.  Reserve space for the register.
+            */
+            offset = payload = fs_reg(GRF, alloc.allocate(2));
+            offset.reg_offset++;
+            inst->mlen = 2;
+         } else {
+            offset = payload = fs_reg(GRF, alloc.allocate(1));
+            inst->mlen = 1;
          }
 
          /* This is actually going to be a MOV, but since only the first dword
@@ -2944,7 +3021,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
           * by live variable analysis, or register allocation will explode.
           */
          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
-                                               8, payload, const_offset_reg);
+                                               8, offset, const_offset_reg);
          setup->force_writemask_all = true;
 
          setup->ir = inst->ir;
@@ -2957,6 +3034,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
           */
          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
          inst->src[1] = payload;
+         inst->base_mrf = -1;
 
          invalidate_live_intervals();
       } else {
@@ -2982,28 +3060,24 @@ fs_visitor::lower_load_payload()
 
       assert(inst->dst.file == MRF || inst->dst.file == GRF);
       assert(inst->saturate == false);
-
-      const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
-                                 .exec_all(inst->force_writemask_all)
-                                 .at(block, inst);
       fs_reg dst = inst->dst;
 
       /* Get rid of COMPR4.  We'll add it back in if we need it */
       if (dst.file == MRF)
          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
 
-      dst.width = 8;
+      const fs_builder ibld(this, block, inst);
+      const fs_builder hbld = ibld.exec_all().group(8, 0);
+
       for (uint8_t i = 0; i < inst->header_size; i++) {
          if (inst->src[i].file != BAD_FILE) {
             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
-            mov_src.width = 8;
-            ibld.exec_all().MOV(mov_dst, mov_src);
+            hbld.MOV(mov_dst, mov_src);
          }
-         dst = offset(dst, 1);
+         dst = offset(dst, hbld, 1);
       }
 
-      dst.width = inst->exec_size;
       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
           inst->exec_size > 8) {
          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
@@ -3033,9 +3107,9 @@ fs_visitor::lower_load_payload()
                } else {
                   /* Platform doesn't have COMPR4.  We have to fake it */
                   fs_reg mov_dst = retype(dst, inst->src[i].type);
-                  mov_dst.width = 8;
                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
-                  ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
+                  mov_dst.reg += 4;
+                  ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
                }
             }
 
@@ -3060,7 +3134,7 @@ fs_visitor::lower_load_payload()
       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
          if (inst->src[i].file != BAD_FILE)
             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
-         dst = offset(dst, 1);
+         dst = offset(dst, ibld, 1);
       }
 
       inst->remove(block);
@@ -3078,155 +3152,208 @@ fs_visitor::lower_integer_multiplication()
 {
    bool progress = false;
 
-   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
-    * directly, but Cherryview cannot.
-    */
-   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
-      return false;
-
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      if (inst->opcode != BRW_OPCODE_MUL ||
-          inst->dst.is_accumulator() ||
-          (inst->dst.type != BRW_REGISTER_TYPE_D &&
-           inst->dst.type != BRW_REGISTER_TYPE_UD))
-         continue;
+      const fs_builder ibld(this, block, inst);
 
-      const fs_builder ibld = bld.at(block, inst);
+      if (inst->opcode == BRW_OPCODE_MUL) {
+         if (inst->dst.is_accumulator() ||
+             (inst->dst.type != BRW_REGISTER_TYPE_D &&
+              inst->dst.type != BRW_REGISTER_TYPE_UD))
+            continue;
 
-      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
-       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
-       * src1 are used.
-       *
-       * If multiplying by an immediate value that fits in 16-bits, do a
-       * single MUL instruction with that value in the proper location.
-       */
-      if (inst->src[1].file == IMM &&
-          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
-         if (devinfo->gen < 7) {
-            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
-                       inst->dst.type, dispatch_width);
-            ibld.MOV(imm, inst->src[1]);
-            ibld.MUL(inst->dst, imm, inst->src[0]);
+         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
+          * operation directly, but CHV/BXT cannot.
+          */
+         if (devinfo->gen >= 8 &&
+             !devinfo->is_cherryview && !devinfo->is_broxton)
+            continue;
+
+         if (inst->src[1].file == IMM &&
+             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+             * src1 are used.
+             *
+             * If multiplying by an immediate value that fits in 16-bits, do a
+             * single MUL instruction with that value in the proper location.
+             */
+            if (devinfo->gen < 7) {
+               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+                          inst->dst.type);
+               ibld.MOV(imm, inst->src[1]);
+               ibld.MUL(inst->dst, imm, inst->src[0]);
+            } else {
+               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+            }
          } else {
-            ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+             * do 32-bit integer multiplication in one instruction, but instead
+             * must do a sequence (which actually calculates a 64-bit result):
+             *
+             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
+             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
+             *    mov(8)  g2<1>D     acc0<8,8,1>D
+             *
+             * But on Gen > 6, the ability to use the second accumulator register
+             * (acc1) for non-float data types was removed, preventing a simple
+             * implementation in SIMD16. A 16-channel result can be calculated by
+             * executing the three instructions twice in SIMD8, once with quarter
+             * control of 1Q for the first eight channels and again with 2Q for
+             * the second eight channels.
+             *
+             * Which accumulator register is implicitly accessed (by AccWrEnable
+             * for instance) is determined by the quarter control. Unfortunately
+             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+             * implicit accumulator access by an instruction with 2Q will access
+             * acc1 regardless of whether the data type is usable in acc1.
+             *
+             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+             * integer data types.
+             *
+             * Since we only want the low 32-bits of the result, we can do two
+             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+             * adjust the high result and add them (like the mach is doing):
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
+             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
+             *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
+             *
+             * We avoid the shl instruction by realizing that we only want to add
+             * the low 16-bits of the "high" result to the high 16-bits of the
+             * "low" result and using proper regioning on the add:
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
+             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
+             *
+             * Since it does not use the (single) accumulator register, we can
+             * schedule multi-component multiplications much better.
+             */
+
+            if (inst->conditional_mod && inst->dst.is_null()) {
+               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+                                  inst->dst.type);
+            }
+            fs_reg low = inst->dst;
+            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+                        inst->dst.type);
+
+            if (devinfo->gen >= 7) {
+               fs_reg src1_0_w = inst->src[1];
+               fs_reg src1_1_w = inst->src[1];
+
+               if (inst->src[1].file == IMM) {
+                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
+                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+               } else {
+                  src1_0_w.type = BRW_REGISTER_TYPE_UW;
+                  if (src1_0_w.stride != 0) {
+                     assert(src1_0_w.stride == 1);
+                     src1_0_w.stride = 2;
+                  }
+
+                  src1_1_w.type = BRW_REGISTER_TYPE_UW;
+                  if (src1_1_w.stride != 0) {
+                     assert(src1_1_w.stride == 1);
+                     src1_1_w.stride = 2;
+                  }
+                  src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+               }
+               ibld.MUL(low, inst->src[0], src1_0_w);
+               ibld.MUL(high, inst->src[0], src1_1_w);
+            } else {
+               fs_reg src0_0_w = inst->src[0];
+               fs_reg src0_1_w = inst->src[0];
+
+               src0_0_w.type = BRW_REGISTER_TYPE_UW;
+               if (src0_0_w.stride != 0) {
+                  assert(src0_0_w.stride == 1);
+                  src0_0_w.stride = 2;
+               }
+
+               src0_1_w.type = BRW_REGISTER_TYPE_UW;
+               if (src0_1_w.stride != 0) {
+                  assert(src0_1_w.stride == 1);
+                  src0_1_w.stride = 2;
+               }
+               src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+
+               ibld.MUL(low, src0_0_w, inst->src[1]);
+               ibld.MUL(high, src0_1_w, inst->src[1]);
+            }
+
+            fs_reg dst = inst->dst;
+            dst.type = BRW_REGISTER_TYPE_UW;
+            dst.subreg_offset = 2;
+            dst.stride = 2;
+
+            high.type = BRW_REGISTER_TYPE_UW;
+            high.stride = 2;
+
+            low.type = BRW_REGISTER_TYPE_UW;
+            low.subreg_offset = 2;
+            low.stride = 2;
+
+            ibld.ADD(dst, low, high);
+
+            if (inst->conditional_mod) {
+               fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+               set_condmod(inst->conditional_mod,
+                           ibld.MOV(null, inst->dst));
+            }
+         }
+
+      } else if (inst->opcode == SHADER_OPCODE_MULH) {
+         /* Should have been lowered to 8-wide. */
+         assert(inst->exec_size <= 8);
+         const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
+                                   inst->dst.type);
+         fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
+         fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+
+         if (devinfo->gen >= 8) {
+            /* Until Gen8, integer multiplies read 32-bits from one source,
+             * and 16-bits from the other, and relying on the MACH instruction
+             * to generate the high bits of the result.
+             *
+             * On Gen8, the multiply instruction does a full 32x32-bit
+             * multiply, but in order to do a 64-bit multiply we can simulate
+             * the previous behavior and then use a MACH instruction.
+             *
+             * FINISHME: Don't use source modifiers on src1.
+             */
+            assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+                   mul->src[1].type == BRW_REGISTER_TYPE_UD);
+            mul->src[1].type = (type_is_signed(mul->src[1].type) ?
+                                BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+            mul->src[1].stride *= 2;
+
+         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
+                    inst->force_sechalf) {
+            /* Among other things the quarter control bits influence which
+             * accumulator register is used by the hardware for instructions
+             * that access the accumulator implicitly (e.g. MACH).  A
+             * second-half instruction would normally map to acc1, which
+             * doesn't exist on Gen7 and up (the hardware does emulate it for
+             * floating-point instructions *only* by taking advantage of the
+             * extra precision of acc0 not normally used for floating point
+             * arithmetic).
+             *
+             * HSW and up are careful enough not to try to access an
+             * accumulator register that doesn't exist, but on earlier Gen7
+             * hardware we need to make sure that the quarter control bits are
+             * zero to avoid non-deterministic behaviour and emit an extra MOV
+             * to get the result masked correctly according to the current
+             * channel enables.
+             */
+            mach->force_sechalf = false;
+            mach->force_writemask_all = true;
+            mach->dst = ibld.vgrf(inst->dst.type);
+            ibld.MOV(inst->dst, mach->dst);
          }
       } else {
-         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
-          * do 32-bit integer multiplication in one instruction, but instead
-          * must do a sequence (which actually calculates a 64-bit result):
-          *
-          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
-          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
-          *    mov(8)  g2<1>D     acc0<8,8,1>D
-          *
-          * But on Gen > 6, the ability to use second accumulator register
-          * (acc1) for non-float data types was removed, preventing a simple
-          * implementation in SIMD16. A 16-channel result can be calculated by
-          * executing the three instructions twice in SIMD8, once with quarter
-          * control of 1Q for the first eight channels and again with 2Q for
-          * the second eight channels.
-          *
-          * Which accumulator register is implicitly accessed (by AccWrEnable
-          * for instance) is determined by the quarter control. Unfortunately
-          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
-          * implicit accumulator access by an instruction with 2Q will access
-          * acc1 regardless of whether the data type is usable in acc1.
-          *
-          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
-          * integer data types.
-          *
-          * Since we only want the low 32-bits of the result, we can do two
-          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
-          * adjust the high result and add them (like the mach is doing):
-          *
-          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
-          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
-          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
-          *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
-          *
-          * We avoid the shl instruction by realizing that we only want to add
-          * the low 16-bits of the "high" result to the high 16-bits of the
-          * "low" result and using proper regioning on the add:
-          *
-          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
-          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
-          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
-          *
-          * Since it does not use the (single) accumulator register, we can
-          * schedule multi-component multiplications much better.
-          */
-
-         if (inst->conditional_mod && inst->dst.is_null()) {
-            inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
-                               inst->dst.type, dispatch_width);
-         }
-         fs_reg low = inst->dst;
-         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
-                     inst->dst.type, dispatch_width);
-
-         if (devinfo->gen >= 7) {
-            fs_reg src1_0_w = inst->src[1];
-            fs_reg src1_1_w = inst->src[1];
-
-            if (inst->src[1].file == IMM) {
-               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
-               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
-            } else {
-               src1_0_w.type = BRW_REGISTER_TYPE_UW;
-               if (src1_0_w.stride != 0) {
-                  assert(src1_0_w.stride == 1);
-                  src1_0_w.stride = 2;
-               }
-
-               src1_1_w.type = BRW_REGISTER_TYPE_UW;
-               if (src1_1_w.stride != 0) {
-                  assert(src1_1_w.stride == 1);
-                  src1_1_w.stride = 2;
-               }
-               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
-            }
-            ibld.MUL(low, inst->src[0], src1_0_w);
-            ibld.MUL(high, inst->src[0], src1_1_w);
-         } else {
-            fs_reg src0_0_w = inst->src[0];
-            fs_reg src0_1_w = inst->src[0];
-
-            src0_0_w.type = BRW_REGISTER_TYPE_UW;
-            if (src0_0_w.stride != 0) {
-               assert(src0_0_w.stride == 1);
-               src0_0_w.stride = 2;
-            }
-
-            src0_1_w.type = BRW_REGISTER_TYPE_UW;
-            if (src0_1_w.stride != 0) {
-               assert(src0_1_w.stride == 1);
-               src0_1_w.stride = 2;
-            }
-            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
-
-            ibld.MUL(low, src0_0_w, inst->src[1]);
-            ibld.MUL(high, src0_1_w, inst->src[1]);
-         }
-
-         fs_reg dst = inst->dst;
-         dst.type = BRW_REGISTER_TYPE_UW;
-         dst.subreg_offset = 2;
-         dst.stride = 2;
-
-         high.type = BRW_REGISTER_TYPE_UW;
-         high.stride = 2;
-
-         low.type = BRW_REGISTER_TYPE_UW;
-         low.subreg_offset = 2;
-         low.stride = 2;
-
-         ibld.ADD(dst, low, high);
-
-         if (inst->conditional_mod) {
-            fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
-            set_condmod(inst->conditional_mod,
-                        ibld.MOV(null, inst->dst));
-         }
+         continue;
       }
 
       inst->remove(block);
@@ -3239,6 +3366,1043 @@ fs_visitor::lower_integer_multiplication()
    return progress;
 }
 
+static void
+setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
+                    fs_reg *dst, fs_reg color, unsigned components)
+{
+   /* Fill dst[] with the channels of color, saturating them if requested. */
+   if (key->clamp_fragment_color) {
+      const fs_reg sat = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+      assert(color.type == BRW_REGISTER_TYPE_F);
+      for (unsigned c = 0; c < components; c++) {
+         fs_inst *mov = bld.MOV(offset(sat, bld, c), offset(color, bld, c));
+         set_saturate(true, mov);
+      }
+      color = sat;
+   }
+
+   for (unsigned c = 0; c < components; c++)
+      dst[c] = offset(color, bld, c);
+}
+/* Turn a logical FB write into FS_OPCODE_FB_WRITE plus its payload. */
+static void
+lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+                            const brw_wm_prog_data *prog_data,
+                            const brw_wm_prog_key *key,
+                            const fs_visitor::thread_payload &payload)
+{
+   assert(inst->src[6].file == IMM);
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &color0 = inst->src[0];
+   const fs_reg &color1 = inst->src[1];
+   const fs_reg &src0_alpha = inst->src[2];
+   const fs_reg &src_depth = inst->src[3];
+   const fs_reg &dst_depth = inst->src[4];
+   fs_reg sample_mask = inst->src[5];
+   const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
+
+   /* We can potentially have a message length of up to 15, so we have to set
+    * base_mrf to either 0 or 1 in order to fit in m0..m15.
+    */
+   fs_reg sources[15];
+   int header_size = 2, payload_header_size;
+   unsigned length = 0;
+
+   /* From the Sandy Bridge PRM, volume 4, page 198:
+    *
+    *     "Dispatched Pixel Enables. One bit per pixel indicating
+    *      which pixels were originally enabled when the thread was
+    *      dispatched. This field is only required for the end-of-
+    *      thread message and on all dual-source messages."
+    */
+   if (devinfo->gen >= 6 &&
+       (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
+       color1.file == BAD_FILE &&
+       key->nr_color_regions == 1) {
+      header_size = 0;
+   }
+
+   if (header_size != 0) {
+      assert(header_size == 2);
+      /* Reserve sources[0..1] for the header; they are left unset here. */
+      length += 2;
+   }
+
+   if (payload.aa_dest_stencil_reg) {
+      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
+      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
+         .MOV(sources[length],
+              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+      length++;
+   }
+
+   if (prog_data->uses_omask) {
+      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
+                               BRW_REGISTER_TYPE_UD);
+
+      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
+       * relevant.  Since it's unsigned single words one vgrf is always
+       * 16-wide, but only the lower or higher 8 channels will be used by the
+       * hardware when doing a SIMD8 write depending on whether we have
+       * selected the subspans for the first or second half respectively.
+       */
+      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+      sample_mask.type = BRW_REGISTER_TYPE_UW;
+      sample_mask.stride *= 2;
+
+      bld.exec_all().annotate("FB write oMask")
+         .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
+                   inst->force_sechalf),
+              sample_mask);
+      length++;
+   }
+   /* Sources gathered so far are passed to LOAD_PAYLOAD as header sources. */
+   payload_header_size = length;
+
+   if (src0_alpha.file != BAD_FILE) {
+      /* FIXME: This is being passed at the wrong location in the payload and
+       * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+       * It's supposed to be immediately before oMask but there seems to be no
+       * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+       * requires header sources to form a contiguous segment at the beginning
+       * of the message and src0_alpha has per-channel semantics.
+       */
+      setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
+      length++;
+   }
+   /* Each color payload takes four slots, whatever "components" is. */
+   setup_color_payload(bld, key, &sources[length], color0, components);
+   length += 4;
+
+   if (color1.file != BAD_FILE) {
+      setup_color_payload(bld, key, &sources[length], color1, components);
+      length += 4;
+   }
+
+   if (src_depth.file != BAD_FILE) {
+      sources[length] = src_depth;
+      length++;
+   }
+
+   if (dst_depth.file != BAD_FILE) {
+      sources[length] = dst_depth;
+      length++;
+   }
+
+   fs_inst *load;
+   if (devinfo->gen >= 7) {
+      /* Send from the GRF.  Note: "payload" here shadows the parameter. */
+      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
+      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
+      payload.reg = bld.shader->alloc.allocate(load->regs_written);
+      load->dst = payload;
+
+      inst->src[0] = payload;
+      inst->resize_sources(1);
+      inst->base_mrf = -1;
+   } else {
+      /* Send from the MRF */
+      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
+                              sources, length, payload_header_size);
+
+      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
+       * will do this for us if we just give it a COMPR4 destination.
+       */
+      if (devinfo->gen < 6 && bld.dispatch_width() == 16)
+         load->dst.reg |= BRW_MRF_COMPR4;
+
+      inst->resize_sources(0);
+      inst->base_mrf = 1;
+   }
+   /* mlen is taken from the register count written by the LOAD_PAYLOAD. */
+   inst->opcode = FS_OPCODE_FB_WRITE;
+   inst->mlen = load->regs_written;
+   inst->header_size = header_size;
+}
+/* Gen4 sampler lowering: writes the message (g0 header first) to MRFs. */
+static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &sampler,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+   fs_reg msg_end = msg_begin;
+
+   /* g0 header. */
+   msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+   for (unsigned i = 0; i < coord_components; i++)
+      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+              offset(coordinate, bld, i));
+
+   msg_end = offset(msg_end, bld, coord_components);
+   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+    * require all three components to be present and zero if they are unused.
+    * msg_end already points past the coordinates, so pad relative to it.
+    */
+   if (coord_components > 0 &&
+       (has_lod || shadow_c.file != BAD_FILE ||
+        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+      for (unsigned i = coord_components; i < 3; i++)
+         bld.MOV(offset(msg_end, bld, i - coord_components), fs_reg(0.0f));
+
+      msg_end = offset(msg_end, bld, 3 - coord_components);
+   }
+
+   if (op == SHADER_OPCODE_TXD) {
+      /* TXD unsupported in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* the slots for u and v are always present, but r is optional */
+      if (coord_components < 2)
+         msg_end = offset(msg_end, bld, 2 - coord_components);
+
+      /*  P   = u, v, r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * 1-arg: Does not exist.
+       *
+       * 2-arg: dudx   dvdx   dudy   dvdy
+       *        dPdx.x dPdx.y dPdy.x dPdy.y
+       *        m4     m5     m6     m7
+       *
+       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
+       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+       *        m5     m6     m7     m8     m9     m10
+       */
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+   }
+
+   if (has_lod) {
+      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
+       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
+       */
+      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+             bld.dispatch_width() == 16);
+
+      const brw_reg_type type =
+         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+      bld.MOV(retype(msg_end, type), lod);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+         /* There's no plain shadow compare message, so we use shadow
+          * compare with a bias of 0.0.
+          */
+         bld.MOV(msg_end, fs_reg(0.0f));
+         msg_end = offset(msg_end, bld, 1);
+      }
+
+      bld.MOV(msg_end, shadow_c);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = msg_begin.reg;
+   inst->mlen = msg_end.reg - msg_begin.reg;
+   inst->header_size = 1;
+}
+/* Sampler lowering for Gen5-6: the payload is written to MRFs from m2 on. */
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
+                                fs_reg coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, fs_reg lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &sampler,
+                                const fs_reg &offset_value,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
+   fs_reg msg_coords = message;
+   unsigned header_size = 0;
+
+   if (offset_value.file != BAD_FILE) {
+      /* The offsets set up by the visitor are in the m1 header, so we can't
+       * go headerless.
+       */
+      header_size = 1;
+      message.reg--;
+   }
+
+   for (unsigned i = 0; i < coord_components; i++) {
+      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
+      coordinate = offset(coordinate, bld, 1);
+   }
+   fs_reg msg_end = offset(msg_coords, bld, coord_components);
+   fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+   if (shadow_c.file != BAD_FILE) {
+      fs_reg msg_shadow = msg_lod;
+      bld.MOV(msg_shadow, shadow_c);
+      msg_lod = offset(msg_shadow, bld, 1);
+      msg_end = msg_lod;
+   }
+
+   switch (op) {
+   case SHADER_OPCODE_TXL:
+   case FS_OPCODE_TXB:
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXD:
+      /**
+       *  P   =  u,    v,    r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * Load up these values:
+       * - dudx   dudy   dvdx   dvdy   drdx   drdy
+       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+       */
+      msg_end = msg_lod;
+      for (unsigned i = 0; i < grad_components; i++) {
+         bld.MOV(msg_end, lod);
+         lod = offset(lod, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
+
+         bld.MOV(msg_end, lod2);
+         lod2 = offset(lod2, bld, 1);
+         msg_end = offset(msg_end, bld, 1);
+      }
+      break;
+   case SHADER_OPCODE_TXS:
+      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF:
+      msg_lod = offset(msg_coords, bld, 3);
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF_CMS:
+      msg_lod = offset(msg_coords, bld, 3);
+      /* lod */
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+      /* sample index */
+      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
+      msg_end = offset(msg_lod, bld, 2);
+      break;
+   default:
+      break;
+   }
+   /* Rewrite inst in place; mlen is the MRF span from message to msg_end. */
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = message.reg;
+   inst->mlen = msg_end.reg - message.reg;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static bool
+is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
+{
+   /* True on HSW/Gen8+ unless the sampler is an immediate index below 16. */
+   const bool hsw_or_newer = devinfo->is_haswell || devinfo->gen >= 8;
+   const bool low = sampler.file == IMM && sampler.fixed_hw_reg.dw1.ud < 16;
+   return hsw_or_newer && !low;
+}
+/* Gen7+ sampler lowering: gathers the sources and sends from a GRF payload. */
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
+                                fs_reg coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, fs_reg lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &mcs, const fs_reg &sampler,
+                                fs_reg offset_value,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   int reg_width = bld.dispatch_width() / 8;
+   unsigned header_size = 0, length = 0;
+   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+       offset_value.file != BAD_FILE ||
+       is_high_sampler(devinfo, sampler)) {
+      /* For general texture offsets (no txf workaround), we need a header to
+       * put them in.  Note that we're only reserving space for it in the
+       * message payload as it will be initialized implicitly by the
+       * generator.
+       *
+       * TG4 needs to place its channel select in the header, for interaction
+       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
+       * larger sampler numbers we need to offset the Sampler State Pointer in
+       * the header.
+       */
+      header_size = 1;
+      sources[0] = fs_reg();
+      length++;
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      bld.MOV(sources[length], shadow_c);
+      length++;
+   }
+
+   bool coordinate_done = false;
+
+   /* The sampler can only meaningfully compute LOD for fragment shader
+    * messages. For all other stages, we change the opcode to TXL and
+    * hardcode the LOD to 0.
+    */
+   if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
+       op == SHADER_OPCODE_TEX) {
+      op = SHADER_OPCODE_TXL;
+      lod = fs_reg(0.0f);
+   }
+
+   /* Set up the LOD info */
+   switch (op) {
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXL:
+      bld.MOV(sources[length], lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXD:
+      /* TXD should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* Load dPdx and the coordinate together:
+       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+       */
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+
+         /* For cube map array, the coordinate is (u,v,r,ai) but there are
+          * only derivatives for (u, v, r).
+          */
+         if (i < grad_components) {
+            bld.MOV(sources[length], lod);
+            lod = offset(lod, bld, 1);
+            length++;
+
+            bld.MOV(sources[length], lod2);
+            lod2 = offset(lod2, bld, 1);
+            length++;
+         }
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TXS:
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXF:
+      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
+       * On Gen9 they are u, v, lod, r
+       */
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+      coordinate = offset(coordinate, bld, 1);
+      length++;
+
+      if (devinfo->gen >= 9) {
+         if (coord_components >= 2) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+            coordinate = offset(coordinate, bld, 1);
+         }
+         length++;
+      }
+
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+      length++;
+
+      for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+      if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+         length++;
+      }
+
+      if (op == SHADER_OPCODE_TXF_CMS) {
+         /* Data from the multisample control surface. */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+         length++;
+      }
+
+      /* There is no offsetting for this message; just copy in the integer
+       * texture coordinates.
+       */
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TG4_OFFSET:
+      /* gather4_po_c should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
+
+      /* More crazy intermixing */
+      for (unsigned i = 0; i < 2; i++) { /* u, v */
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
+         offset_value = offset(offset_value, bld, 1);
+         length++;
+      }
+
+      if (coord_components == 3) { /* r if present */
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+
+      coordinate_done = true;
+      break;
+   default:
+      break;
+   }
+
+   /* Set up the coordinate (except for cases where it was done above) */
+   if (!coordinate_done) {
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(sources[length], coordinate);
+         coordinate = offset(coordinate, bld, 1);
+         length++;
+      }
+   }
+   /* In SIMD16 every source except the header spans two registers. */
+   int mlen;
+   if (reg_width == 2)
+      mlen = length * reg_width - header_size;
+   else
+      mlen = length * reg_width;
+
+   const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
+                                     BRW_REGISTER_TYPE_F);
+   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+
+   /* Generate the SEND. */
+   inst->opcode = op;
+   inst->src[0] = src_payload;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = -1;
+   inst->mlen = mlen;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+/* Dispatch a logical sampler opcode to the lowering for this hardware gen. */
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+   /* The last two sources are immediates holding the component counts. */
+   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
+   const unsigned n_coord = inst->src[8].fixed_hw_reg.dw1.ud;
+   const unsigned n_grad = inst->src[9].fixed_hw_reg.dw1.ud;
+   const fs_reg &coordinate = inst->src[0];
+   const fs_reg &shadow_c = inst->src[1];
+   const fs_reg &lod = inst->src[2];
+   const fs_reg &lod2 = inst->src[3];
+   const fs_reg &sample_index = inst->src[4];
+   const fs_reg &mcs = inst->src[5];
+   const fs_reg &sampler = inst->src[6];
+   const fs_reg &offset_value = inst->src[7];
+   const brw_device_info *devinfo = bld.shader->devinfo;
+
+   if (devinfo->gen < 5)
+      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sampler,
+                                      n_coord, n_grad);
+   else if (devinfo->gen < 7)
+      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      sampler, offset_value,
+                                      n_coord, n_grad);
+   else
+      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
+                                      shadow_c, lod, lod2, sample_index,
+                                      mcs, sampler, offset_value,
+                                      n_coord, n_grad);
+}
+
+/**
+ * Build the 8-wide header used by some typed and untyped surface
+ * messages: all zeros, with \p sample_mask stored in component 7.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+   const fs_builder hdr_bld = bld.exec_all().group(8, 0);
+   const fs_reg header = hdr_bld.vgrf(BRW_REGISTER_TYPE_UD);
+   hdr_bld.MOV(header, fs_reg(0));
+   hdr_bld.MOV(component(header, 7), sample_mask);
+   return header;
+}
+/* Lower a logical surface message to "op", building its payload. */
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+                           const fs_reg &sample_mask)
+{
+   /* Logical send arguments; src[3] (dims) is not used by the lowering. */
+   const fs_reg &addr = inst->src[0];
+   const fs_reg &src = inst->src[1];
+   const fs_reg &surface = inst->src[2];
+   const fs_reg &arg = inst->src[4];
+
+   /* Payload layout: optional header, then address and data components. */
+   const bool has_header = sample_mask.file != BAD_FILE;
+   const unsigned header_sz = has_header ? 1 : 0;
+   const unsigned addr_sz = inst->components_read(0);
+   const unsigned src_sz = inst->components_read(1);
+   const unsigned sz = header_sz + addr_sz + src_sz;
+
+   /* Temporary source array and the destination register for the payload. */
+   fs_reg *const srcs = new fs_reg[sz];
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+   fs_reg *next = srcs;
+
+   if (has_header)
+      *next++ = emit_surface_header(bld, sample_mask);
+
+   for (unsigned i = 0; i < addr_sz; i++)
+      *next++ = offset(addr, bld, i);
+
+   for (unsigned i = 0; i < src_sz; i++)
+      *next++ = offset(src, bld, i);
+
+   bld.LOAD_PAYLOAD(payload, srcs, sz, header_sz);
+
+   /* Turn the logical instruction into the send reading that payload. */
+   inst->opcode = op;
+   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+   inst->header_size = header_sz;
+
+   /* surface and arg alias src[2] and src[4], so assign in ascending order. */
+   inst->src[0] = payload;
+   inst->src[1] = surface;
+   inst->src[2] = arg;
+   inst->resize_sources(3);
+
+   delete[] srcs;
+}
+
+bool
+fs_visitor::lower_logical_sends()
+{
+   bool progress = false;
+   /* Pass each *_LOGICAL instruction to the helper that lowers it. */
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld(this, block, inst);
+
+      switch (inst->opcode) {
+      case FS_OPCODE_FB_WRITE_LOGICAL:
+         assert(stage == MESA_SHADER_FRAGMENT);
+         lower_fb_write_logical_send(ibld, inst,
+                                     (const brw_wm_prog_data *)prog_data,
+                                     (const brw_wm_prog_key *)key,
+                                     payload);
+         break;
+      /* Sampler messages: the target opcode is the name minus _LOGICAL. */
+      case SHADER_OPCODE_TEX_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+         break;
+
+      case SHADER_OPCODE_TXD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+         break;
+
+      case SHADER_OPCODE_TXF_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+         break;
+
+      case SHADER_OPCODE_TXL_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+         break;
+
+      case SHADER_OPCODE_TXS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+         break;
+
+      case FS_OPCODE_TXB_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+         break;
+
+      case SHADER_OPCODE_TXF_CMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
+         break;
+
+      case SHADER_OPCODE_TXF_UMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
+         break;
+
+      case SHADER_OPCODE_TXF_MCS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
+         break;
+
+      case SHADER_OPCODE_LOD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
+         break;
+
+      case SHADER_OPCODE_TG4_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
+         break;
+
+      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
+         break;
+      /* Surface ops: reads pass mask 0xffff, others ibld.sample_mask_reg(). */
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+                                    fs_reg(0xffff));
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_READ,
+                                    fs_reg(0xffff));
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      default:
+         continue; /* Not a logical send; progress is left untouched. */
+      }
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst.  The instruction will be left untouched by
+ * fs_visitor::lower_simd_width() if the returned value is equal to the
+ * original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct brw_device_info *devinfo,
+                       const fs_inst *inst)
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_SEL:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_CMPN:
+   case BRW_OPCODE_CSEL:
+   case BRW_OPCODE_F32TO16:
+   case BRW_OPCODE_F16TO32:
+   case BRW_OPCODE_BFREV:
+   case BRW_OPCODE_BFE:
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_BFI2:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+   case BRW_OPCODE_AVG:
+   case BRW_OPCODE_FRC:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_LZD:
+   case BRW_OPCODE_FBH:
+   case BRW_OPCODE_FBL:
+   case BRW_OPCODE_CBIT:
+   case BRW_OPCODE_SAD2:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS: {
+      /* According to the PRMs:
+       *  "A. In Direct Addressing mode, a source cannot span more than 2
+       *      adjacent GRF registers.
+       *   B. A destination cannot span more than 2 adjacent GRF registers."
+       *
+       * Look for the source or destination with the largest register region
+       * which is the one that is going to limit the overall execution size of
+       * the instruction due to this rule.
+       */
+      unsigned reg_count = inst->regs_written;
+
+      for (unsigned i = 0; i < inst->sources; i++)
+         reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
+
+      /* Divide the execution size by the factor by which the largest region
+       * exceeds the 2-GRF limit.  NOTE(review): assumes reg_count is nonzero.
+       */
+      return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
+   }
+   case SHADER_OPCODE_MULH:
+      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
+       * is 8-wide on Gen7+.
+       */
+      return (devinfo->gen >= 7 ? 8 : inst->exec_size);
+
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
+       * here.
+       */
+      assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
+             inst->exec_size == 8);
+      /* Dual-source FB writes are unsupported in SIMD16 mode. */
+      return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
+
+   case SHADER_OPCODE_TXD_LOGICAL:
+      /* TXD is unsupported in SIMD16 mode. */
+      return 8;
+
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
+      /* gather4_po_c is unsupported in SIMD16 mode. */
+      const fs_reg &shadow_c = inst->src[1];
+      return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
+   }
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL: {
+      /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
+       * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
+       * mode because the message exceeds the maximum length of 11.
+       */
+      const fs_reg &shadow_c = inst->src[1];
+      if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
+         return 16;
+      else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
+         return 8;
+      else
+         return inst->exec_size;
+   }
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+      /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+       * messages.  Use SIMD16 instead.
+       */
+      if (devinfo->gen == 4)
+         return 16;
+      else
+         return inst->exec_size;
+
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return 8; /* NOTE(review): presumably typed messages are SIMD8-only. */
+
+   default:
+      return inst->exec_size;
+   }
+}
+
+/**
+ * The \p rows array of registers represents a \p num_rows by \p num_columns
+ * matrix in row-major order; write it in column-major order into the register
+ * passed as destination.  \p stride gives the separation between matrix
+ * elements in the input in fs_builder::dispatch_width() units.
+ */
+static void
+emit_transpose(const fs_builder &bld,
+               const fs_reg &dst, const fs_reg *rows,
+               unsigned num_rows, unsigned num_columns, unsigned stride)
+{
+   fs_reg *const components = new fs_reg[num_rows * num_columns];
+   /* Element (row j, column i) lands at index num_rows * i + j. */
+   for (unsigned i = 0; i < num_columns; ++i) {
+      for (unsigned j = 0; j < num_rows; ++j)
+         components[num_rows * i + j] = offset(rows[j], bld, stride * i);
+   }
+   /* Last argument is header_size: none of the sources is a header. */
+   bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
+
+   delete[] components;
+}
+
+bool
+fs_visitor::lower_simd_width()
+{
+   bool progress = false;
+   /* Rewrite every instruction whose exec_size is not its lowered width. */
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
+
+      if (lower_width != inst->exec_size) {
+         /* Builder matching the original instruction.  We may also need to
+          * emit an instruction of width larger than the original, set the
+          * execution size of the builder to the highest of both for now so
+          * we're sure that both cases can be handled.
+          */
+         const fs_builder ibld = bld.at(block, inst)
+                                    .exec_all(inst->force_writemask_all)
+                                    .group(MAX2(inst->exec_size, lower_width),
+                                           inst->force_sechalf);
+
+         /* Split the copies in chunks of the execution width of either the
+          * original or the lowered instruction, whichever is lower.
+          */
+         const unsigned copy_width = MIN2(lower_width, inst->exec_size);
+         const unsigned n = inst->exec_size / copy_width;
+         const unsigned dst_size = inst->regs_written * REG_SIZE /
+            inst->dst.component_size(inst->exec_size);
+         fs_reg dsts[4]; /* One slot per split; n <= 4 is asserted below. */
+
+         assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
+                !inst->writes_accumulator && !inst->mlen);
+
+         for (unsigned i = 0; i < n; i++) {
+            /* Emit a copy of the original instruction with the lowered width.
+             * If the EOT flag was set throw it away except for the last
+             * instruction to avoid killing the thread prematurely.
+             */
+            fs_inst split_inst = *inst;
+            split_inst.exec_size = lower_width;
+            split_inst.eot = inst->eot && i == n - 1;
+
+            /* Select the correct channel enables for the i-th group, then
+             * transform the sources and destination and emit the lowered
+             * instruction.
+             */
+            const fs_builder lbld = ibld.group(lower_width, i);
+
+            for (unsigned j = 0; j < inst->sources; j++) {
+               if (inst->src[j].file != BAD_FILE &&
+                   !is_uniform(inst->src[j])) {
+                  /* Get the i-th copy_width-wide chunk of the source. */
+                  const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+                  const unsigned src_size = inst->components_read(j);
+
+                  /* Use a trivial transposition to copy one of every n
+                   * copy_width-wide components of the register into a
+                   * temporary passed as source to the lowered instruction.
+                   */
+                  split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
+                  emit_transpose(lbld.group(copy_width, 0),
+                                 split_inst.src[j], &src, 1, src_size, n);
+               }
+            }
+
+            if (inst->regs_written) {
+               /* Allocate enough space to hold the result of the lowered
+                * instruction and fix up the number of registers written.
+                */
+               split_inst.dst = dsts[i] =
+                  lbld.vgrf(inst->dst.type, dst_size);
+               split_inst.regs_written =
+                  DIV_ROUND_UP(inst->regs_written * lower_width,
+                               inst->exec_size);
+            }
+
+            lbld.emit(split_inst);
+         }
+
+         if (inst->regs_written) {
+            /* Distance between useful channels in the temporaries, skipping
+             * garbage if the lowered instruction is wider than the original.
+             */
+            const unsigned m = lower_width / copy_width;
+
+            /* Interleave the components of the result from the lowered
+             * instructions.  We need to set exec_all() when copying more than
+             * one half per component, because LOAD_PAYLOAD (in terms of which
+             * emit_transpose is implemented) can only use the same channel
+             * enable signals for all of its non-header sources.
+             */
+            emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
+                               .group(copy_width, 0),
+                           inst->dst, dsts, n, dst_size, m);
+         }
+
+         inst->remove(block);
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::dump_instructions()
 {
@@ -3316,9 +4480,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    switch (inst->dst.file) {
    case GRF:
       fprintf(file, "vgrf%d", inst->dst.reg);
-      if (inst->dst.width != dispatch_width)
-         fprintf(file, "@%d", inst->dst.width);
-      if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
+      if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
           inst->dst.subreg_offset)
          fprintf(file, "+%d.%d",
                  inst->dst.reg_offset, inst->dst.subreg_offset);
@@ -3376,9 +4538,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       switch (inst->src[i].file) {
       case GRF:
          fprintf(file, "vgrf%d", inst->src[i].reg);
-         if (inst->src[i].width != dispatch_width)
-            fprintf(file, "@%d", inst->src[i].width);
-         if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
+         if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
              inst->src[i].subreg_offset)
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
@@ -3655,9 +4815,11 @@ fs_visitor::optimize()
     * Ideally optimization passes wouldn't be part of the visitor so they
     * wouldn't have access to bld at all, but they do, so just in case some
     * pass forgets to ask for a location explicitly set it to NULL here to
-    * make it trip.
+    * make it trip.  The dispatch width is initialized to a bogus value to
+    * make sure that optimizations set the execution controls explicitly to
+    * match the code they are manipulating instead of relying on the defaults.
     */
-   bld = bld.at(NULL, NULL);
+   bld = fs_builder(this, 64);
 
    split_virtual_grfs();
 
@@ -3690,9 +4852,13 @@ fs_visitor::optimize()
       backend_shader::dump_instructions(filename);
    }
 
-   bool progress;
+   bool progress = false;
    int iteration = 0;
    int pass_num = 0;
+
+   OPT(lower_simd_width);
+   OPT(lower_logical_sends);
+
    do {
       progress = false;
       pass_num = 0;
@@ -3837,7 +5003,9 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
    if (failed)
       return false;
 
-   emit_urb_writes(clip_planes);
+   compute_clip_distance(clip_planes);
+
+   emit_urb_writes();
 
    if (shader_time_index >= 0)
       emit_shader_time_end();
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 243baf688de..975183e990d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -62,6 +62,27 @@ namespace brw {
    class fs_live_variables;
 }
 
+static inline fs_reg
+offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta)
+{
+   switch (reg.file) { /* How delta is applied depends on the file. */
+   case BAD_FILE:
+      break; /* Returned unchanged. */
+   case GRF:
+   case MRF:
+   case HW_REG:
+   case ATTR:
+      return byte_offset(reg, /* delta components at the builder's width */
+                         delta * reg.component_size(bld.dispatch_width()));
+   case UNIFORM:
+      reg.reg_offset += delta; /* Not scaled by the dispatch width. */
+      break;
+   case IMM:
+      assert(delta == 0); /* Leaves the switch; reg is returned as is. */
+   }
+   return reg;
+}
+
 /**
  * The fragment shader front-end.
  *
@@ -161,7 +182,9 @@ public:
    void no16(const char *msg);
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
+   bool lower_logical_sends();
    bool lower_integer_multiplication();
+   bool lower_simd_width();
    bool opt_combine_constants();
 
    void emit_dummy_fs();
@@ -185,27 +208,6 @@ public:
    void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
    fs_reg rescale_texcoord(fs_reg coordinate, int coord_components,
                            bool is_rect, uint32_t sampler, int texunit);
-   fs_inst *emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_comp,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              uint32_t sampler);
-   fs_inst *emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
-                                     fs_reg coordinate, int vector_elements,
-                                     fs_reg shadow_c, fs_reg lod,
-                                     uint32_t sampler);
-   fs_inst *emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_comp,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, uint32_t sampler,
-                              bool has_offset);
-   fs_inst *emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_comp,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, fs_reg mcs, fs_reg sampler,
-                              fs_reg offset_value);
    void emit_texture(ir_texture_opcode op,
                      const glsl_type *dest_type,
                      fs_reg coordinate, int components,
@@ -220,9 +222,10 @@ public:
                      uint32_t sampler,
                      fs_reg sampler_reg,
                      int texunit);
-   fs_reg emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler);
+   fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+                         const fs_reg &sampler);
    void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
-   void resolve_source_modifiers(fs_reg *src);
+   fs_reg resolve_source_modifiers(const fs_reg &src);
    void emit_discard_jump();
    bool try_replace_with_sel();
    bool opt_peephole_sel();
@@ -249,6 +252,10 @@ public:
    void nir_emit_block(nir_block *block);
    void nir_emit_instr(nir_instr *instr);
    void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+   void nir_emit_load_const(const brw::fs_builder &bld,
+                            nir_load_const_instr *instr);
+   void nir_emit_undef(const brw::fs_builder &bld,
+                       nir_ssa_undef_instr *instr);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
    void nir_emit_texture(const brw::fs_builder &bld,
@@ -257,21 +264,19 @@ public:
                       nir_jump_instr *instr);
    fs_reg get_nir_src(nir_src src);
    fs_reg get_nir_dest(nir_dest dest);
+   fs_reg get_nir_image_deref(const nir_deref_var *deref);
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
 
    bool optimize_frontfacing_ternary(nir_alu_instr *instr,
                                      const fs_reg &result);
 
-   void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
-                            unsigned exec_size, bool use_2nd_half);
    void emit_alpha_test();
    fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
                                  fs_reg color1, fs_reg color2,
-                                 fs_reg src0_alpha, unsigned components,
-                                 unsigned exec_size, bool use_2nd_half = false);
+                                 fs_reg src0_alpha, unsigned components);
    void emit_fb_writes();
-   void emit_urb_writes(gl_clip_plane *clip_planes);
+   void emit_urb_writes();
    void emit_cs_terminate();
 
    void emit_barrier();
@@ -282,16 +287,13 @@ public:
                         int shader_time_subindex,
                         fs_reg value);
 
-   void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
-                            fs_reg dst, fs_reg offset, fs_reg src0,
-                            fs_reg src1);
-
-   void emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
-                                  fs_reg offset);
-
    fs_reg get_timestamp(const brw::fs_builder &bld);
 
    struct brw_reg interp_reg(int location, int channel);
+
+   virtual void setup_vector_uniform_values(const gl_constant_value *values,
+                                            unsigned n);
+
    int implied_mrf_writes(fs_inst *inst);
 
    virtual void dump_instructions();
@@ -345,7 +347,7 @@ public:
    unsigned max_grf;
 
    fs_reg *nir_locals;
-   fs_reg *nir_globals;
+   fs_reg *nir_ssa_values;
    fs_reg nir_inputs;
    fs_reg nir_outputs;
    fs_reg *nir_system_values;
@@ -359,7 +361,7 @@ public:
    fs_reg result;
 
    /** Register numbers for thread payload fields. */
-   struct {
+   struct thread_payload {
       uint8_t source_depth_reg;
       uint8_t source_w_reg;
       uint8_t aa_dest_stencil_reg;
@@ -468,10 +470,6 @@ private:
                                           struct brw_reg msg_data,
                                           unsigned msg_type);
 
-   void generate_set_omask(fs_inst *inst,
-                           struct brw_reg dst,
-                           struct brw_reg sample_mask);
-
    void generate_set_sample_id(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 58ac5980da5..34545eaa0fb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -63,6 +63,22 @@ namespace brw {
       {
       }
 
+      /**
+       * Construct an fs_builder that inserts instructions into \p shader
+       * before instruction \p inst in basic block \p block.  The dispatch
+       * width (inst->exec_size), group (8 if force_sechalf is set, else 0),
+       * force_writemask_all flag and debug annotation are taken from \p inst.
+       */
+      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
+         shader(shader), block(block), cursor(inst),
+         _dispatch_width(inst->exec_size),
+         _group(inst->force_sechalf ? 8 : 0),
+         force_writemask_all(inst->force_writemask_all)
+      {
+         annotation.str = inst->annotation;
+         annotation.ir = inst->ir;
+      }
+
       /**
        * Construct an fs_builder that inserts instructions before \p cursor in
        * basic block \p block, inheriting other code generation parameters
@@ -99,8 +115,8 @@ namespace brw {
       fs_builder
       group(unsigned n, unsigned i) const
       {
-         assert(n <= dispatch_width() &&
-                i < dispatch_width() / n);
+         assert(force_writemask_all ||
+                (n <= dispatch_width() && i < dispatch_width() / n));
          fs_builder bld = *this;
          bld._dispatch_width = n;
          bld._group += i * n;
@@ -160,10 +176,15 @@ namespace brw {
       dst_reg
       vgrf(enum brw_reg_type type, unsigned n = 1) const
       {
-         return dst_reg(GRF, shader->alloc.allocate(
-                           DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
-                                        REG_SIZE)),
-                        type, dispatch_width());
+         assert(dispatch_width() <= 32);
+
+         if (n > 0)
+            return dst_reg(GRF, shader->alloc.allocate(
+                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
+                                           REG_SIZE)),
+                           type);
+         else
+            return retype(null_reg_ud(), type);
       }
 
       /**
@@ -235,7 +256,7 @@ namespace brw {
       instruction *
       emit(enum opcode opcode, const dst_reg &dst) const
       {
-         return emit(instruction(opcode, dst));
+         return emit(instruction(opcode, dispatch_width(), dst));
       }
 
       /**
@@ -253,11 +274,11 @@ namespace brw {
          case SHADER_OPCODE_SIN:
          case SHADER_OPCODE_COS:
             return fix_math_instruction(
-               emit(instruction(opcode, dst.width, dst,
+               emit(instruction(opcode, dispatch_width(), dst,
                                 fix_math_operand(src0))));
 
          default:
-            return emit(instruction(opcode, dst.width, dst, src0));
+            return emit(instruction(opcode, dispatch_width(), dst, src0));
          }
       }
 
@@ -273,12 +294,12 @@ namespace brw {
          case SHADER_OPCODE_INT_QUOTIENT:
          case SHADER_OPCODE_INT_REMAINDER:
             return fix_math_instruction(
-               emit(instruction(opcode, dst.width, dst,
+               emit(instruction(opcode, dispatch_width(), dst,
                                 fix_math_operand(src0),
                                 fix_math_operand(src1))));
 
          default:
-            return emit(instruction(opcode, dst.width, dst, src0, src1));
+            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
 
          }
       }
@@ -295,22 +316,35 @@ namespace brw {
          case BRW_OPCODE_BFI2:
          case BRW_OPCODE_MAD:
          case BRW_OPCODE_LRP:
-            return emit(instruction(opcode, dst.width, dst,
+            return emit(instruction(opcode, dispatch_width(), dst,
                                     fix_3src_operand(src0),
                                     fix_3src_operand(src1),
                                     fix_3src_operand(src2)));
 
          default:
-            return emit(instruction(opcode, dst.width, dst, src0, src1, src2));
+            return emit(instruction(opcode, dispatch_width(), dst,
+                                    src0, src1, src2));
          }
       }
 
+      /**
+       * Create and insert into the program an instruction with the \p n
+       * sources in \p srcs, constructed with the builder's dispatch_width().
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
+           unsigned n) const
+      {
+         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
+      }
+
       /**
        * Insert a preallocated instruction into the program.
        */
       instruction *
       emit(instruction *inst) const
       {
+         assert(inst->exec_size <= 32);
          assert(inst->exec_size == dispatch_width() ||
                 force_writemask_all);
          assert(_group == 0 || _group == 8);
@@ -349,17 +383,19 @@ namespace brw {
       }
 
       /**
-       * Copy any live channel from \p src to the first channel of \p dst.
+       * Copy any live channel from \p src to the first channel of the result.
        */
-      void
-      emit_uniformize(const dst_reg &dst, const src_reg &src) const
+      src_reg
+      emit_uniformize(const src_reg &src) const
       {
          const fs_builder ubld = exec_all();
-         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+         const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
+         const dst_reg dst = component(vgrf(src.type), 0);
 
-         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
-         ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
-                   src, component(chan_index, 0));
+         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
+         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
+
+         return src_reg(dst);
       }
 
       /**
@@ -515,20 +551,10 @@ namespace brw {
       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                    unsigned sources, unsigned header_size) const
       {
-         assert(dst.width % 8 == 0);
-         instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
-                                              dst.width, dst, src, sources));
+         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
          inst->header_size = header_size;
-
-         for (unsigned i = 0; i < header_size; i++)
-            assert(src[i].file != GRF ||
-                   src[i].width * type_sz(src[i].type) == 32);
-         inst->regs_written = header_size;
-
-         for (unsigned i = header_size; i < sources; ++i)
-            assert(src[i].file != GRF ||
-                   src[i].width == dst.width);
-         inst->regs_written += (sources - header_size) * (dst.width / 8);
+         inst->regs_written = header_size +
+                              (sources - header_size) * (dispatch_width() / 8);
 
          return inst;
       }
@@ -626,8 +652,8 @@ namespace brw {
                inst->resize_sources(1);
                inst->src[0] = src0;
 
-               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
-                                          dispatch_width()), src1);
+               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
+                                   src1);
             }
          }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index d0f61222e5a..a8883a35ef2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -243,6 +243,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_find_msb:
    case ir_unop_find_lsb:
    case ir_unop_saturate:
+   case ir_unop_subroutine_to_int:
       for (i = 0; i < vector_elements; i++) {
 	 ir_rvalue *op0 = get_element(op_var[0], i);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index 0af5a915c9f..c182232285e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -277,7 +277,7 @@ fs_visitor::opt_combine_constants()
        */
       exec_node *n = (imm->inst ? imm->inst :
                       imm->block->last_non_control_flow_inst()->next);
-      const fs_builder ibld = bld.at(imm->block, n).exec_all();
+      const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
 
       ibld.MOV(reg, fs_reg(imm->val));
       imm->reg = reg.reg;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index c92aae4b1d6..5445ad55670 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -339,6 +339,14 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    if (entry->src.stride * inst->src[arg].stride > 4)
       return false;
 
+   /* Bail if the instruction type is larger than the execution type of the
+    * copy, which implies that each channel is reading multiple channels of the
+    * destination of the copy, and simply replacing the sources would give a
+    * program with different semantics.
+    */
+   if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
+      return false;
+
    /* Bail if the result of composing both strides cannot be expressed
     * as another stride. This avoids, for example, trying to transform
     * this:
@@ -388,17 +396,14 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
 
    switch (entry->src.file) {
    case UNIFORM:
-      assert(entry->src.width == 1);
    case BAD_FILE:
    case HW_REG:
-      inst->src[arg].width = entry->src.width;
       inst->src[arg].reg_offset = entry->src.reg_offset;
       inst->src[arg].subreg_offset = entry->src.subreg_offset;
       break;
    case ATTR:
    case GRF:
       {
-         assert(entry->src.width % inst->src[arg].width == 0);
          /* In this case, we'll just leave the width alone.  The source
           * register could have different widths depending on how it is
           * being used.  For instance, if only half of the register was
@@ -529,6 +534,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
 
       case BRW_OPCODE_MACH:
       case BRW_OPCODE_MUL:
+      case SHADER_OPCODE_MULH:
       case BRW_OPCODE_ADD:
       case BRW_OPCODE_OR:
       case BRW_OPCODE_AND:
@@ -715,7 +721,6 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
                entry->dst = inst->dst;
                entry->dst.reg_offset = offset;
-               entry->dst.width = effective_width;
                entry->src = inst->src[i];
                entry->regs_written = regs_written;
                entry->opcode = inst->opcode;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 70f0217b93d..c7628dcc2f4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -61,6 +61,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case BRW_OPCODE_CMPN:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_FRC:
    case BRW_OPCODE_RNDU:
    case BRW_OPCODE_RNDD:
@@ -179,9 +180,7 @@ static void
 create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
 {
    int written = inst->regs_written;
-   int dst_width = inst->dst.width / 8;
-   const fs_builder ubld = bld.group(inst->exec_size, inst->force_sechalf)
-                              .exec_all(inst->force_writemask_all);
+   int dst_width = inst->exec_size / 8;
    fs_inst *copy;
 
    if (written > dst_width) {
@@ -200,16 +199,15 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
       payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
       for (int i = 0; i < header_size; i++) {
          payload[i] = src;
-         payload[i].width = 8;
          src.reg_offset++;
       }
       for (int i = header_size; i < sources; i++) {
          payload[i] = src;
-         src = offset(src, 1);
+         src = offset(src, bld, 1);
       }
-      copy = ubld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
+      copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
    } else {
-      copy = ubld.MOV(inst->dst, src);
+      copy = bld.MOV(inst->dst, src);
       copy->src[0].negate = negate;
    }
    assert(copy->regs_written == written);
@@ -259,15 +257,14 @@ fs_visitor::opt_cse_local(bblock_t *block)
              */
             bool no_existing_temp = entry->tmp.file == BAD_FILE;
             if (no_existing_temp && !entry->generator->dst.is_null()) {
+               const fs_builder ibld = fs_builder(this, block, entry->generator)
+                                       .at(block, entry->generator->next);
                int written = entry->generator->regs_written;
-               assert((written * 8) % entry->generator->dst.width == 0);
 
                entry->tmp = fs_reg(GRF, alloc.allocate(written),
-                                   entry->generator->dst.type,
-                                   entry->generator->dst.width);
+                                   entry->generator->dst.type);
 
-               create_copy_instr(bld.at(block, entry->generator->next),
-                                 entry->generator, entry->tmp, false);
+               create_copy_instr(ibld, entry->generator, entry->tmp, false);
 
                entry->generator->dst = entry->tmp;
             }
@@ -275,10 +272,10 @@ fs_visitor::opt_cse_local(bblock_t *block)
             /* dest <- temp */
             if (!inst->dst.is_null()) {
                assert(inst->regs_written == entry->generator->regs_written);
-               assert(inst->dst.width == entry->generator->dst.width);
                assert(inst->dst.type == entry->tmp.type);
+               const fs_builder ibld(this, block, inst);
 
-               create_copy_instr(bld.at(block, inst), inst, entry->tmp, negate);
+               create_copy_instr(ibld, inst, entry->tmp, negate);
             }
 
             /* Set our iterator so that next time through the loop inst->next
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 2ed0bac6fd9..c86ca043b63 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -48,7 +48,7 @@ static uint32_t brw_file_from_reg(fs_reg *reg)
 }
 
 static struct brw_reg
-brw_reg_from_fs_reg(fs_reg *reg)
+brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg)
 {
    struct brw_reg brw_reg;
 
@@ -57,10 +57,10 @@ brw_reg_from_fs_reg(fs_reg *reg)
    case MRF:
       if (reg->stride == 0) {
          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
-      } else if (reg->width < 8) {
+      } else if (inst->exec_size < 8) {
          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
-         brw_reg = stride(brw_reg, reg->width * reg->stride,
-                          reg->width, reg->stride);
+         brw_reg = stride(brw_reg, inst->exec_size * reg->stride,
+                          inst->exec_size, reg->stride);
       } else {
          /* From the Haswell PRM:
           *
@@ -79,6 +79,10 @@ brw_reg_from_fs_reg(fs_reg *reg)
       brw_reg = byte_offset(brw_reg, reg->subreg_offset);
       break;
    case IMM:
+      assert(reg->stride == ((reg->type == BRW_REGISTER_TYPE_V ||
+                              reg->type == BRW_REGISTER_TYPE_UV ||
+                              reg->type == BRW_REGISTER_TYPE_VF) ? 1 : 0));
+
       switch (reg->type) {
       case BRW_REGISTER_TYPE_F:
 	 brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
@@ -217,11 +221,11 @@ fs_generator::fire_fb_write(fs_inst *inst,
    if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
    else if (prog_data->dual_src_blend) {
-      if (dispatch_width == 8 || !inst->eot)
+      if (!inst->force_sechalf)
          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
       else
          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
-   } else if (dispatch_width == 16)
+   } else if (inst->exec_size == 16)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
    else
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
@@ -414,7 +418,7 @@ fs_generator::generate_blorp_fb_write(fs_inst *inst)
    brw_fb_WRITE(p,
                 16 /* dispatch_width */,
                 brw_message_reg(inst->base_mrf),
-                brw_reg_from_fs_reg(&inst->src[0]),
+                brw_reg_from_fs_reg(inst, &inst->src[0]),
                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
                 inst->target,
                 inst->mlen,
@@ -651,7 +655,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 /* Note that G45 and older determines shadow compare and dispatch width
 	  * from message length for most messages.
 	  */
-         if (dispatch_width == 8) {
+         if (inst->exec_size == 8) {
             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
             if (inst->shadow_compare) {
                assert(inst->mlen == 6);
@@ -670,7 +674,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 break;
       case FS_OPCODE_TXB:
 	 if (inst->shadow_compare) {
-            assert(dispatch_width == 8);
+            assert(inst->exec_size == 8);
 	    assert(inst->mlen == 6);
 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 	 } else {
@@ -681,7 +685,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 break;
       case SHADER_OPCODE_TXL:
 	 if (inst->shadow_compare) {
-            assert(dispatch_width == 8);
+            assert(inst->exec_size == 8);
 	    assert(inst->mlen == 6);
 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
 	 } else {
@@ -692,7 +696,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 	 break;
       case SHADER_OPCODE_TXD:
 	 /* There is no sample_d_c message; comparisons are done manually */
-         assert(dispatch_width == 8);
+         assert(inst->exec_size == 8);
 	 assert(inst->mlen == 7 || inst->mlen == 10);
 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 	 break;
@@ -1054,7 +1058,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                        struct brw_reg index,
                                                        struct brw_reg offset)
 {
-   assert(inst->mlen == 0);
    assert(index.type == BRW_REGISTER_TYPE_UD);
 
    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
@@ -1069,12 +1072,10 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
 
    struct brw_reg src = offset;
    bool header_present = false;
-   int mlen = 1;
 
    if (devinfo->gen >= 9) {
       /* Skylake requires a message header in order to use SIMD4x2 mode. */
-      src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
-      mlen = 2;
+      src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD);
       header_present = true;
 
       brw_push_insn_state(p);
@@ -1105,7 +1106,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                               0, /* LD message ignores sampler unit */
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1, /* rlen */
-                              mlen,
+                              inst->mlen,
                               header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
@@ -1135,7 +1136,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                               0, /* LD message ignores sampler unit */
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1, /* rlen */
-                              mlen,
+                              inst->mlen,
                               header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
@@ -1363,37 +1364,6 @@ fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
    brw_pop_insn_state(p);
 }
 
-/* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
- * (when mask is passed as a uniform) of register mask before moving it
- * to register dst.
- */
-void
-fs_generator::generate_set_omask(fs_inst *inst,
-                                 struct brw_reg dst,
-                                 struct brw_reg mask)
-{
-   bool stride_8_8_1 =
-    (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
-     mask.width == BRW_WIDTH_8 &&
-     mask.hstride == BRW_HORIZONTAL_STRIDE_1);
-
-   bool stride_0_1_0 = has_scalar_region(mask);
-
-   assert(stride_8_8_1 || stride_0_1_0);
-   assert(dst.type == BRW_REGISTER_TYPE_UW);
-
-   brw_push_insn_state(p);
-   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-
-   if (stride_8_8_1) {
-      brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
-   } else if (stride_0_1_0) {
-      brw_MOV(p, dst, retype(mask, dst.type));
-   }
-   brw_pop_insn_state(p);
-}
-
 /* Sets vstride=1, width=4, hstride=0 of register src1 during
  * the ADD instruction.
  */
@@ -1563,7 +1533,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
 
       for (unsigned int i = 0; i < inst->sources; i++) {
-	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
+	 src[i] = brw_reg_from_fs_reg(inst, &inst->src[i]);
 
 	 /* The accumulator result appears to get used for the
 	  * conditional modifier generation.  When negating a UD
@@ -1575,7 +1545,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
 		!inst->src[i].negate);
       }
-      dst = brw_reg_from_fs_reg(&inst->dst);
+      dst = brw_reg_from_fs_reg(inst, &inst->dst);
 
       brw_set_default_predicate_control(p, inst->predicate);
       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
@@ -1604,7 +1574,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          /* If the instruction writes to more than one register, it needs to
           * be a "compressed" instruction on Gen <= 5.
           */
-         if (inst->exec_size * inst->dst.stride * type_sz(inst->dst.type) > 32)
+         if (inst->dst.component_size(inst->exec_size) > REG_SIZE)
             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
          else
             brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1872,7 +1842,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 	 break;
 
       case BRW_OPCODE_DO:
-	 brw_DO(p, BRW_EXECUTE_8);
+	 brw_DO(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
 	 break;
 
       case BRW_OPCODE_BREAK:
@@ -2019,19 +1989,15 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
 
       case SHADER_OPCODE_UNTYPED_ATOMIC:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud,
                             inst->mlen, !inst->dst.is_null());
-         brw_mark_surface_used(prog_data, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_read(p, dst, src[0], src[1],
                                   inst->mlen, src[2].dw1.ud);
-         brw_mark_surface_used(prog_data, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -2073,10 +2039,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
-      case FS_OPCODE_SET_OMASK:
-         generate_set_omask(inst, dst, src[0]);
-         break;
-
       case FS_OPCODE_SET_SAMPLE_ID:
          generate_set_sample_id(inst, dst, src[0], src[1]);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index 502161d5128..19aec92fad1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -204,27 +204,9 @@ fs_live_variables::compute_live_variables()
    while (cont) {
       cont = false;
 
-      foreach_block (block, cfg) {
+      foreach_block_reverse (block, cfg) {
          struct block_data *bd = &block_data[block->num];
 
-	 /* Update livein */
-	 for (int i = 0; i < bitset_words; i++) {
-            BITSET_WORD new_livein = (bd->use[i] |
-                                      (bd->liveout[i] &
-                                       ~bd->def[i]));
-	    if (new_livein & ~bd->livein[i]) {
-               bd->livein[i] |= new_livein;
-               cont = true;
-	    }
-	 }
-         BITSET_WORD new_livein = (bd->flag_use[0] |
-                                   (bd->flag_liveout[0] &
-                                    ~bd->flag_def[0]));
-         if (new_livein & ~bd->flag_livein[0]) {
-            bd->flag_livein[0] |= new_livein;
-            cont = true;
-         }
-
 	 /* Update liveout */
 	 foreach_list_typed(bblock_link, child_link, link, &block->children) {
             struct block_data *child_bd = &block_data[child_link->block->num];
@@ -244,6 +226,24 @@ fs_live_variables::compute_live_variables()
                cont = true;
             }
 	 }
+
+         /* Update livein */
+         for (int i = 0; i < bitset_words; i++) {
+            BITSET_WORD new_livein = (bd->use[i] |
+                                      (bd->liveout[i] &
+                                       ~bd->def[i]));
+            if (new_livein & ~bd->livein[i]) {
+               bd->livein[i] |= new_livein;
+               cont = true;
+            }
+         }
+         BITSET_WORD new_livein = (bd->flag_use[0] |
+                                   (bd->flag_liveout[0] &
+                                    ~bd->flag_def[0]));
+         if (new_livein & ~bd->flag_livein[0]) {
+            bd->flag_livein[0] |= new_livein;
+            cont = true;
+         }
       }
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 4d98b048433..93a36cc03bf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -24,8 +24,10 @@
 #include "glsl/ir.h"
 #include "glsl/ir_optimization.h"
 #include "glsl/nir/glsl_to_nir.h"
+#include "main/shaderimage.h"
 #include "program/prog_to_nir.h"
 #include "brw_fs.h"
+#include "brw_fs_surface_builder.h"
 #include "brw_nir.h"
 
 using namespace brw;
@@ -38,31 +40,11 @@ fs_visitor::emit_nir_code()
    /* emit the arrays used for inputs and outputs - load/store intrinsics will
     * be converted to reads/writes of these arrays
     */
-
-   if (nir->num_inputs > 0) {
-      nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
-      nir_setup_inputs(nir);
-   }
-
-   if (nir->num_outputs > 0) {
-      nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
-      nir_setup_outputs(nir);
-   }
-
-   if (nir->num_uniforms > 0) {
-      nir_setup_uniforms(nir);
-   }
-
+   nir_setup_inputs(nir);
+   nir_setup_outputs(nir);
+   nir_setup_uniforms(nir);
    nir_emit_system_values(nir);
 
-   nir_globals = ralloc_array(mem_ctx, fs_reg, nir->reg_alloc);
-   foreach_list_typed(nir_register, reg, node, &nir->registers) {
-      unsigned array_elems =
-         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
-      unsigned size = array_elems * reg->num_components;
-      nir_globals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
-   }
-
    /* get the main function and emit it */
    nir_foreach_overload(nir, overload) {
       assert(strcmp(overload->function->name, "main") == 0);
@@ -74,9 +56,11 @@ fs_visitor::emit_nir_code()
 void
 fs_visitor::nir_setup_inputs(nir_shader *shader)
 {
+   nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, shader->num_inputs);
+
    foreach_list_typed(nir_variable, var, node, &shader->inputs) {
       enum brw_reg_type type = brw_type_for_base_type(var->type);
-      fs_reg input = offset(nir_inputs, var->data.driver_location);
+      fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
 
       fs_reg reg;
       switch (stage) {
@@ -91,25 +75,35 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
           * So, we need to copy from fs_reg(ATTR, var->location) to
           * offset(nir_inputs, var->data.driver_location).
           */
-         unsigned components = var->type->without_array()->components();
+         const glsl_type *const t = var->type->without_array();
+         const unsigned components = t->components();
+         const unsigned cols = t->matrix_columns;
+         const unsigned elts = t->vector_elements;
          unsigned array_length = var->type->is_array() ? var->type->length : 1;
          for (unsigned i = 0; i < array_length; i++) {
-            for (unsigned j = 0; j < components; j++) {
-               bld.MOV(retype(offset(input, components * i + j), type),
-                       offset(fs_reg(ATTR, var->data.location + i, type), j));
+            for (unsigned j = 0; j < cols; j++) {
+               for (unsigned k = 0; k < elts; k++) {
+                  bld.MOV(offset(retype(input, type), bld,
+                                 components * i + elts * j + k),
+                          offset(fs_reg(ATTR, var->data.location + i, type),
+                                 bld, 4 * j + k));
+               }
             }
          }
          break;
       }
       case MESA_SHADER_GEOMETRY:
       case MESA_SHADER_COMPUTE:
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
          unreachable("fs_visitor not used for these stages yet.");
          break;
       case MESA_SHADER_FRAGMENT:
          if (var->data.location == VARYING_SLOT_POS) {
             reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
                                                 var->data.origin_upper_left);
-            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, input, reg), 0xF);
+            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
+                                      input, reg), 0xF);
          } else {
             emit_general_interpolation(input, var->name, var->type,
                                        (glsl_interp_qualifier) var->data.interpolation,
@@ -126,45 +120,54 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 {
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
+   nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, shader->num_outputs);
+
    foreach_list_typed(nir_variable, var, node, &shader->outputs) {
-      fs_reg reg = offset(nir_outputs, var->data.driver_location);
+      fs_reg reg = offset(nir_outputs, bld, var->data.driver_location);
 
       int vector_elements =
          var->type->is_array() ? var->type->fields.array->vector_elements
                                : var->type->vector_elements;
 
-      if (stage == MESA_SHADER_VERTEX) {
+      switch (stage) {
+      case MESA_SHADER_VERTEX:
          for (int i = 0; i < ALIGN(type_size(var->type), 4) / 4; i++) {
             int output = var->data.location + i;
-            this->outputs[output] = offset(reg, 4 * i);
+            this->outputs[output] = offset(reg, bld, 4 * i);
             this->output_components[output] = vector_elements;
          }
-      } else if (var->data.index > 0) {
-         assert(var->data.location == FRAG_RESULT_DATA0);
-         assert(var->data.index == 1);
-         this->dual_src_output = reg;
-         this->do_dual_src = true;
-      } else if (var->data.location == FRAG_RESULT_COLOR) {
-         /* Writing gl_FragColor outputs to all color regions. */
-         for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
-            this->outputs[i] = reg;
-            this->output_components[i] = 4;
-         }
-      } else if (var->data.location == FRAG_RESULT_DEPTH) {
-         this->frag_depth = reg;
-      } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
-         this->sample_mask = reg;
-      } else {
-         /* gl_FragData or a user-defined FS output */
-         assert(var->data.location >= FRAG_RESULT_DATA0 &&
-                var->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
+         break;
+      case MESA_SHADER_FRAGMENT:
+         if (var->data.index > 0) {
+            assert(var->data.location == FRAG_RESULT_DATA0);
+            assert(var->data.index == 1);
+            this->dual_src_output = reg;
+            this->do_dual_src = true;
+         } else if (var->data.location == FRAG_RESULT_COLOR) {
+            /* Writing gl_FragColor outputs to all color regions. */
+            for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
+               this->outputs[i] = reg;
+               this->output_components[i] = 4;
+            }
+         } else if (var->data.location == FRAG_RESULT_DEPTH) {
+            this->frag_depth = reg;
+         } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
+            this->sample_mask = reg;
+         } else {
+            /* gl_FragData or a user-defined FS output */
+            assert(var->data.location >= FRAG_RESULT_DATA0 &&
+                   var->data.location < FRAG_RESULT_DATA0+BRW_MAX_DRAW_BUFFERS);
 
-         /* General color output. */
-         for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
-            int output = var->data.location - FRAG_RESULT_DATA0 + i;
-            this->outputs[output] = offset(reg, vector_elements * i);
-            this->output_components[output] = vector_elements;
+            /* General color output. */
+            for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
+               int output = var->data.location - FRAG_RESULT_DATA0 + i;
+               this->outputs[output] = offset(reg, bld, vector_elements * i);
+               this->output_components[output] = vector_elements;
+            }
          }
+         break;
+      default:
+         unreachable("unhandled shader stage");
       }
    }
 }
@@ -172,18 +175,20 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 void
 fs_visitor::nir_setup_uniforms(nir_shader *shader)
 {
-   uniforms = shader->num_uniforms;
    num_direct_uniforms = shader->num_direct_uniforms;
 
+   if (dispatch_width != 8)
+      return;
+
    /* We split the uniform register file in half.  The first half is
     * entirely direct uniforms.  The second half is indirect.
     */
-   param_size[0] = num_direct_uniforms;
+   if (num_direct_uniforms > 0)
+      param_size[0] = num_direct_uniforms;
    if (shader->num_uniforms > num_direct_uniforms)
       param_size[num_direct_uniforms] = shader->num_uniforms - num_direct_uniforms;
 
-   if (dispatch_width != 8)
-      return;
+   uniforms = shader->num_uniforms;
 
    if (shader_prog) {
       foreach_list_typed(nir_variable, var, node, &shader->uniforms) {
@@ -233,17 +238,26 @@ fs_visitor::nir_setup_uniform(nir_variable *var)
          continue;
       }
 
-      unsigned slots = storage->type->component_slots();
-      if (storage->array_elements)
-         slots *= storage->array_elements;
+      if (storage->type->is_image()) {
+         /* Images don't get a valid location assigned by nir_lower_io()
+          * because their size is driver-specific, so we need to allocate
+          * space for them here at the end of the parameter array.
+          */
+         var->data.driver_location = uniforms;
+         param_size[uniforms] =
+            BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1);
 
-      for (unsigned i = 0; i < slots; i++) {
-         stage_prog_data->param[index++] = &storage->storage[i];
+         setup_image_uniform_values(storage);
+      } else {
+         unsigned slots = storage->type->component_slots();
+         if (storage->array_elements)
+            slots *= storage->array_elements;
+
+         for (unsigned i = 0; i < slots; i++) {
+            stage_prog_data->param[index++] = &storage->storage[i];
+         }
       }
    }
-
-   /* Make sure we actually initialized the right amount of stuff here. */
-   assert(var->data.driver_location + var->type->component_slots() == index);
 }
 
 void
@@ -366,6 +380,9 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
       nir_locals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
    }
 
+   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
+                             impl->ssa_alloc);
+
    nir_emit_cf_list(&impl->body);
 }
 
@@ -413,18 +430,12 @@ fs_visitor::nir_emit_if(nir_if *if_stmt)
 
    bld.emit(BRW_OPCODE_ENDIF);
 
-   if (!try_replace_with_sel() && devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
+   try_replace_with_sel();
 }
 
 void
 fs_visitor::nir_emit_loop(nir_loop *loop)
 {
-   if (devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
-
    bld.emit(BRW_OPCODE_DO);
 
    nir_emit_cf_list(&loop->body);
@@ -459,9 +470,11 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       break;
 
    case nir_instr_type_load_const:
-      /* We can hit these, but we do nothing now and use them as
-       * immediates later.
-       */
+      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
+      break;
+
+   case nir_instr_type_ssa_undef:
+      nir_emit_undef(abld, nir_instr_as_ssa_undef(instr));
       break;
 
    case nir_instr_type_jump:
@@ -473,39 +486,16 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
    }
 }
 
-static brw_reg_type
-brw_type_for_nir_type(nir_alu_type type)
-{
-   switch (type) {
-   case nir_type_unsigned:
-      return BRW_REGISTER_TYPE_UD;
-   case nir_type_bool:
-   case nir_type_int:
-      return BRW_REGISTER_TYPE_D;
-   case nir_type_float:
-      return BRW_REGISTER_TYPE_F;
-   default:
-      unreachable("unknown type");
-   }
-
-   return BRW_REGISTER_TYPE_F;
-}
-
 bool
 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                          const fs_reg &result)
 {
-   if (instr->src[0].src.is_ssa ||
-       !instr->src[0].src.reg.reg ||
-       !instr->src[0].src.reg.reg->parent_instr)
-      return false;
-
-   if (instr->src[0].src.reg.reg->parent_instr->type !=
-       nir_instr_type_intrinsic)
+   if (!instr->src[0].src.is_ssa ||
+       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
       return false;
 
    nir_intrinsic_instr *src0 =
-      nir_instr_as_intrinsic(instr->src[0].src.reg.reg->parent_instr);
+      nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 
    if (src0->intrinsic != nir_intrinsic_load_front_face)
       return false;
@@ -618,11 +608,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
             continue;
 
          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
-            inst = bld.MOV(offset(temp, i),
-                           offset(op[0], instr->src[0].swizzle[i]));
+            inst = bld.MOV(offset(temp, bld, i),
+                           offset(op[0], bld, instr->src[0].swizzle[i]));
          } else {
-            inst = bld.MOV(offset(temp, i),
-                           offset(op[i], instr->src[i].swizzle[0]));
+            inst = bld.MOV(offset(temp, bld, i),
+                           offset(op[i], bld, instr->src[i].swizzle[0]));
          }
          inst->saturate = instr->dest.saturate;
       }
@@ -636,7 +626,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
             if (!(instr->dest.write_mask & (1 << i)))
                continue;
 
-            bld.MOV(offset(result, i), offset(temp, i));
+            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
          }
       }
       return;
@@ -657,12 +647,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
       channel = ffs(instr->dest.write_mask) - 1;
 
-      result = offset(result, channel);
+      result = offset(result, bld, channel);
    }
 
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
-      op[i] = offset(op[i], instr->src[i].swizzle[channel]);
+      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
    }
 
    switch (instr->op) {
@@ -788,67 +778,20 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_imul_high:
-   case nir_op_umul_high: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
-
-      fs_inst *mul = bld.MUL(acc, op[0], op[1]);
-      bld.MACH(result, op[0], op[1]);
-
-      /* Until Gen8, integer multiplies read 32-bits from one source, and
-       * 16-bits from the other, and relying on the MACH instruction to
-       * generate the high bits of the result.
-       *
-       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
-       * but in order to do a 64x64-bit multiply we have to simulate the
-       * previous behavior and then use a MACH instruction.
-       *
-       * FINISHME: Don't use source modifiers on src1.
-       */
-      if (devinfo->gen >= 8) {
-         assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
-                mul->src[1].type == BRW_REGISTER_TYPE_UD);
-         if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
-            mul->src[1].type = BRW_REGISTER_TYPE_W;
-            mul->src[1].stride = 2;
-         } else {
-            mul->src[1].type = BRW_REGISTER_TYPE_UW;
-            mul->src[1].stride = 2;
-         }
-      }
+   case nir_op_umul_high:
+      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
       break;
-   }
 
    case nir_op_idiv:
    case nir_op_udiv:
       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
       break;
 
-   case nir_op_uadd_carry: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
+   case nir_op_uadd_carry:
+      unreachable("Should have been lowered by carry_to_arith().");
 
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      bld.ADDC(bld.null_reg_ud(), op[0], op[1]);
-      bld.MOV(result, fs_reg(acc));
-      break;
-   }
-
-   case nir_op_usub_borrow: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      bld.SUBB(bld.null_reg_ud(), op[0], op[1]);
-      bld.MOV(result, fs_reg(acc));
-      break;
-   }
+   case nir_op_usub_borrow:
+      unreachable("Should have been lowered by borrow_to_arith().");
 
    case nir_op_umod:
       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
@@ -878,28 +821,28 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 
    case nir_op_inot:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
+         op[0] = resolve_source_modifiers(op[0]);
       }
       bld.NOT(result, op[0]);
       break;
    case nir_op_ixor:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
-         resolve_source_modifiers(&op[1]);
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
       }
       bld.XOR(result, op[0], op[1]);
       break;
    case nir_op_ior:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
-         resolve_source_modifiers(&op[1]);
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
       }
       bld.OR(result, op[0], op[1]);
       break;
    case nir_op_iand:
       if (devinfo->gen >= 8) {
-         resolve_source_modifiers(&op[0]);
-         resolve_source_modifiers(&op[1]);
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
       }
       bld.AND(result, op[0], op[1]);
       break;
@@ -959,10 +902,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_b2i:
-      bld.AND(result, op[0], fs_reg(1));
-      break;
    case nir_op_b2f:
-      bld.AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u));
+      bld.MOV(result, negate(op[0]));
       break;
 
    case nir_op_f2b:
@@ -1146,17 +1087,36 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    }
 }
 
+void
+fs_visitor::nir_emit_load_const(const fs_builder &bld,
+                                nir_load_const_instr *instr)
+{
+   /* One MOV per component into a D-typed register, recorded by SSA index. */
+   const unsigned num_comps = instr->def.num_components;
+   fs_reg value = bld.vgrf(BRW_REGISTER_TYPE_D, num_comps);
+   for (unsigned c = 0; c < num_comps; c++)
+      bld.MOV(offset(value, bld, c), fs_reg(instr->value.i[c]));
+   nir_ssa_values[instr->def.index] = value;
+}
+
+void
+fs_visitor::nir_emit_undef(const fs_builder &bld, nir_ssa_undef_instr *instr)
+{
+   const unsigned num_comps = instr->def.num_components; /* no MOV emitted */
+   nir_ssa_values[instr->def.index] = bld.vgrf(BRW_REGISTER_TYPE_D, num_comps);
+}
+
 static fs_reg
 fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
                    unsigned base_offset, nir_src *indirect)
 {
    fs_reg reg;
-   if (nir_reg->is_global)
-      reg = v->nir_globals[nir_reg->index];
-   else
-      reg = v->nir_locals[nir_reg->index];
 
-   reg = offset(reg, base_offset * nir_reg->num_components);
+   assert(!nir_reg->is_global);
+
+   reg = v->nir_locals[nir_reg->index];
+
+   reg = offset(reg, v->bld, base_offset * nir_reg->num_components);
    if (indirect) {
       int multiplier = nir_reg->num_components * (v->dispatch_width / 8);
 
@@ -1171,34 +1131,77 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
 fs_reg
 fs_visitor::get_nir_src(nir_src src)
 {
+   fs_reg reg;
    if (src.is_ssa) {
-      assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
-      nir_load_const_instr *load = nir_instr_as_load_const(src.ssa->parent_instr);
-      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, src.ssa->num_components);
-
-      for (unsigned i = 0; i < src.ssa->num_components; ++i)
-         bld.MOV(offset(reg, i), fs_reg(load->value.i[i]));
-
-      return reg;
+      reg = nir_ssa_values[src.ssa->index];
    } else {
-      fs_reg reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
-                                      src.reg.indirect);
-
-      /* to avoid floating-point denorm flushing problems, set the type by
-       * default to D - instructions that need floating point semantics will set
-       * this to F if they need to
-       */
-      return retype(reg, BRW_REGISTER_TYPE_D);
+      reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
+                               src.reg.indirect);
    }
+
+   /* Both SSA values and NIR registers are returned retyped to D to avoid
+    * floating-point denorm flushing problems; instructions that need
+    * floating-point semantics will set the type to F themselves.
+    */
+   return retype(reg, BRW_REGISTER_TYPE_D);
 }
 
 fs_reg
 fs_visitor::get_nir_dest(nir_dest dest)
 {
+   /* SSA dest: record the register from bld.vgrf() under the value index. */
+   if (dest.is_ssa) {
+      nir_ssa_values[dest.ssa.index] = bld.vgrf(BRW_REGISTER_TYPE_F,
+                                                dest.ssa.num_components);
+      return nir_ssa_values[dest.ssa.index];
+   }
    return fs_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
                              dest.reg.indirect);
 }
 
+fs_reg
+fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
+{
+   /* Build the UNIFORM register describing the image referenced by deref. */
+   fs_reg image(UNIFORM, deref->var->data.driver_location,
+                BRW_REGISTER_TYPE_UD);
+
+   if (deref->deref.child) {
+      const nir_deref_array *deref_array =
+         nir_deref_as_array(deref->deref.child);
+      assert(deref->deref.child->deref_type == nir_deref_type_array &&
+             deref_array->deref.child == NULL);
+      const unsigned size = glsl_get_length(deref->var->type);
+      const unsigned base = MIN2(deref_array->base_offset, size - 1);
+      /* The constant part of the index is clamped to the last element. */
+      image = offset(image, bld, base * BRW_IMAGE_PARAM_SIZE);
+
+      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+         fs_reg *tmp = new(mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+         if (devinfo->gen == 7 && !devinfo->is_haswell) {
+            /* IVB hangs when trying to access an invalid surface index with
+             * the dataport.  According to the spec "if the index used to
+             * select an individual element is negative or greater than or
+             * equal to the size of the array, the results of the operation
+             * are undefined but may not lead to termination" -- which is one
+             * of the possible outcomes of the hang.  Clamp the index to
+             * prevent access outside of the array bounds.
+             */
+            bld.emit_minmax(*tmp, retype(get_nir_src(deref_array->indirect),
+                                         BRW_REGISTER_TYPE_UD),
+                            fs_reg(size - base - 1), BRW_CONDITIONAL_L);
+         } else {
+            bld.MOV(*tmp, get_nir_src(deref_array->indirect));
+         }
+         /* Scale the element index by BRW_IMAGE_PARAM_SIZE. */
+         bld.MUL(*tmp, *tmp, fs_reg(BRW_IMAGE_PARAM_SIZE));
+         image.reladdr = tmp;
+      }
+   }
+
+   return image;
+}
+
 void
 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
                          unsigned wr_mask)
@@ -1208,15 +1211,64 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
          continue;
 
       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
-      new_inst->dst = offset(new_inst->dst, i);
+      new_inst->dst = offset(new_inst->dst, bld, i);
       for (unsigned j = 0; j < new_inst->sources; j++)
          if (new_inst->src[j].file == GRF)
-            new_inst->src[j] = offset(new_inst->src[j], i);
+            new_inst->src[j] = offset(new_inst->src[j], bld, i);
 
       bld.emit(new_inst);
    }
 }
 
+/**
+ * Return the register type matching the channel data of an image intrinsic
+ * that operates on an image of the given GLSL type.
+ */
+static brw_reg_type
+get_image_base_type(const glsl_type *type)
+{
+   const glsl_base_type base = (glsl_base_type)type->sampler_type;
+
+   if (base == GLSL_TYPE_UINT)
+      return BRW_REGISTER_TYPE_UD;
+   if (base == GLSL_TYPE_INT)
+      return BRW_REGISTER_TYPE_D;
+   if (base == GLSL_TYPE_FLOAT)
+      return BRW_REGISTER_TYPE_F;
+
+   unreachable("Not reached.");
+}
+
+/**
+ * Get the BRW_AOP_* atomic op for an image atomic intrinsic on image type.
+ */
+static unsigned
+get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
+{
+   switch (op) {
+   case nir_intrinsic_image_atomic_add:
+      return BRW_AOP_ADD;
+   case nir_intrinsic_image_atomic_min: /* IMIN only if base type is D */
+      return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
+              BRW_AOP_IMIN : BRW_AOP_UMIN);
+   case nir_intrinsic_image_atomic_max: /* IMAX only if base type is D */
+      return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
+              BRW_AOP_IMAX : BRW_AOP_UMAX);
+   case nir_intrinsic_image_atomic_and:
+      return BRW_AOP_AND;
+   case nir_intrinsic_image_atomic_or:
+      return BRW_AOP_OR;
+   case nir_intrinsic_image_atomic_xor:
+      return BRW_AOP_XOR;
+   case nir_intrinsic_image_atomic_exchange:
+      return BRW_AOP_MOV;
+   case nir_intrinsic_image_atomic_comp_swap:
+      return BRW_AOP_CMPWR;
+   default: /* any other intrinsic is a caller bug */
+      unreachable("Not reachable.");
+   }
+}
+
 void
 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
@@ -1255,25 +1307,102 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_atomic_counter_inc:
    case nir_intrinsic_atomic_counter_dec:
    case nir_intrinsic_atomic_counter_read: {
-      unsigned surf_index = prog_data->binding_table.abo_start +
-                            (unsigned) instr->const_index[0];
-      fs_reg offset = fs_reg(get_nir_src(instr->src[0]));
+      using namespace surface_access;
 
+      /* Get the arguments of the atomic intrinsic. */
+      const fs_reg offset = get_nir_src(instr->src[0]);
+      const unsigned surface = (stage_prog_data->binding_table.abo_start +
+                                instr->const_index[0]);
+      fs_reg tmp;
+
+      /* Emit a surface read or atomic op. */
       switch (instr->intrinsic) {
-         case nir_intrinsic_atomic_counter_inc:
-            emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset,
-                                fs_reg(), fs_reg());
-            break;
-         case nir_intrinsic_atomic_counter_dec:
-            emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset,
-                                fs_reg(), fs_reg());
-            break;
-         case nir_intrinsic_atomic_counter_read:
-            emit_untyped_surface_read(surf_index, dest, offset);
-            break;
-         default:
-            unreachable("Unreachable");
+      case nir_intrinsic_atomic_counter_read:
+         tmp = emit_untyped_read(bld, fs_reg(surface), offset, 1, 1);
+         break;
+
+      case nir_intrinsic_atomic_counter_inc:
+         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+                                   fs_reg(), 1, 1, BRW_AOP_INC);
+         break;
+
+      case nir_intrinsic_atomic_counter_dec:
+         tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
+                                   fs_reg(), 1, 1, BRW_AOP_PREDEC);
+         break;
+
+      default:
+         unreachable("Unreachable");
       }
+
+      /* Assign the result. */
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
+
+      /* Mark the surface as used. */
+      brw_mark_surface_used(stage_prog_data, surface);
+      break;
+   }
+
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_image_atomic_min:
+   case nir_intrinsic_image_atomic_max:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_comp_swap: {
+      using namespace image_access;
+
+      /* Get the referenced image variable and type. */
+      const nir_variable *var = instr->variables[0]->var;
+      const glsl_type *type = var->type->without_array();
+      const brw_reg_type base_type = get_image_base_type(type);
+
+      /* Get some metadata from the image intrinsic. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+      const unsigned arr_dims = type->sampler_array ? 1 : 0;
+      const unsigned surf_dims = type->coordinate_components() - arr_dims;
+      const mesa_format format =
+         (var->data.image.write_only ? MESA_FORMAT_NONE :
+          _mesa_get_shader_image_format(var->data.image.format));
+
+      /* Get the arguments of the image intrinsic. */
+      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg addr = retype(get_nir_src(instr->src[0]),
+                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg src0 = (info->num_srcs >= 3 ?
+                           retype(get_nir_src(instr->src[2]), base_type) :
+                           fs_reg());
+      const fs_reg src1 = (info->num_srcs >= 4 ?
+                           retype(get_nir_src(instr->src[3]), base_type) :
+                           fs_reg());
+      fs_reg tmp;
+
+      /* Emit an image load, store or atomic op. */
+      if (instr->intrinsic == nir_intrinsic_image_load)
+         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
+
+      else if (instr->intrinsic == nir_intrinsic_image_store)
+         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, format);
+
+      else
+         tmp = emit_image_atomic(bld, image, addr, src0, src1,
+                                 surf_dims, arr_dims, info->dest_components,
+                                 get_image_atomic_op(instr->intrinsic, type));
+
+      /* Assign the result. */
+      for (unsigned c = 0; c < info->dest_components; ++c)
+         bld.MOV(offset(retype(dest, base_type), bld, c),
+                 offset(tmp, bld, c));
+      break;
+   }
+
+   case nir_intrinsic_memory_barrier: {
+      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
+      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+         ->regs_written = 2;
       break;
    }
 
@@ -1322,7 +1451,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       assert(sample_pos.file != BAD_FILE);
       dest.type = sample_pos.type;
       bld.MOV(dest, sample_pos);
-      bld.MOV(offset(dest, 1), offset(sample_pos, 1));
+      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
       break;
    }
 
@@ -1349,13 +1478,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       }
 
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = offset(retype(uniform_reg, dest.type), index);
+         fs_reg src = offset(retype(uniform_reg, dest.type), bld, index);
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
          index++;
 
          bld.MOV(dest, src);
-         dest = offset(dest, 1);
+         dest = offset(dest, bld, 1);
       }
       break;
    }
@@ -1387,7 +1516,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          surf_index = vgrf(glsl_type::uint_type);
          bld.ADD(surf_index, get_nir_src(instr->src[0]),
                  fs_reg(stage_prog_data->binding_table.ubo_start));
-         bld.emit_uniformize(surf_index, surf_index);
+         surf_index = bld.emit_uniformize(surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
@@ -1406,7 +1535,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
          unsigned vec4_offset = instr->const_index[1] / 4;
          for (int i = 0; i < instr->num_components; i++)
-            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, i), surf_index,
+            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
                                        base_offset, vec4_offset + i);
       } else {
          fs_reg packed_consts = vgrf(glsl_type::float_type);
@@ -1425,7 +1554,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
             assert(packed_consts.subreg_offset < 32);
 
             bld.MOV(dest, packed_consts);
-            dest = offset(dest, 1);
+            dest = offset(dest, bld, 1);
          }
       }
       break;
@@ -1437,14 +1566,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_load_input: {
       unsigned index = 0;
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = offset(retype(nir_inputs, dest.type),
+         fs_reg src = offset(retype(nir_inputs, dest.type), bld,
                              instr->const_index[0] + index);
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
          index++;
 
          bld.MOV(dest, src);
-         dest = offset(dest, 1);
+         dest = offset(dest, bld, 1);
       }
       break;
    }
@@ -1470,11 +1599,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_interp_var_at_centroid:
    case nir_intrinsic_interp_var_at_sample:
    case nir_intrinsic_interp_var_at_offset: {
-      /* in SIMD16 mode, the pixel interpolator returns coords interleaved
-       * 8 channels at a time, same as the barycentric coords presented in
-       * the FS payload. this requires a bit of extra work to support.
-       */
-      no16("interpolate_at_* not yet supported in SIMD16 mode.");
+      assert(stage == MESA_SHADER_FRAGMENT);
+
+      ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true;
 
       fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
 
@@ -1517,7 +1644,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                                        BRW_REGISTER_TYPE_F);
             for (int i = 0; i < 2; i++) {
                fs_reg temp = vgrf(glsl_type::float_type);
-               bld.MUL(temp, offset(offset_src, i), fs_reg(16.0f));
+               bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
                fs_reg itemp = vgrf(glsl_type::int_type);
                bld.MOV(itemp, temp);  /* float to int */
 
@@ -1537,10 +1664,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
                 */
                set_condmod(BRW_CONDITIONAL_L,
-                           bld.SEL(offset(src, i), itemp, fs_reg(7)));
+                           bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
             }
 
-            mlen = 2;
+            mlen = 2 * dispatch_width / 8;
             inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
                             fs_reg(0u));
          }
@@ -1552,7 +1679,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       }
 
       inst->mlen = mlen;
-      inst->regs_written = 2; /* 2 floats per slot returned */
+      /* 2 floats per slot returned */
+      inst->regs_written = 2 * dispatch_width / 8;
       inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
                                INTERP_QUALIFIER_NOPERSPECTIVE;
 
@@ -1561,7 +1689,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          src.type = dest.type;
 
          bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
-         dest = offset(dest, 1);
+         dest = offset(dest, bld, 1);
       }
       break;
    }
@@ -1573,13 +1701,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg src = get_nir_src(instr->src[0]);
       unsigned index = 0;
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg new_dest = offset(retype(nir_outputs, src.type),
+         fs_reg new_dest = offset(retype(nir_outputs, src.type), bld,
                                   instr->const_index[0] + index);
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
          index++;
          bld.MOV(new_dest, src);
-         src = offset(src, 1);
+         src = offset(src, bld, 1);
       }
       break;
    }
@@ -1689,7 +1817,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
          /* Emit code to evaluate the actual indexing expression */
          sampler_reg = vgrf(glsl_type::uint_type);
          bld.ADD(sampler_reg, src, fs_reg(sampler));
-         bld.emit_uniformize(sampler_reg, sampler_reg);
+         sampler_reg = bld.emit_uniformize(sampler_reg);
          break;
       }
 
@@ -1715,20 +1843,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       }
    }
 
-   enum glsl_base_type dest_base_type;
-   switch (instr->dest_type) {
-   case nir_type_float:
-      dest_base_type = GLSL_TYPE_FLOAT;
-      break;
-   case nir_type_int:
-      dest_base_type = GLSL_TYPE_INT;
-      break;
-   case nir_type_unsigned:
-      dest_base_type = GLSL_TYPE_UINT;
-      break;
-   default:
-      unreachable("bad type");
-   }
+   enum glsl_base_type dest_base_type =
+     brw_glsl_base_type_for_nir_type (instr->dest_type);
 
    const glsl_type *dest_type =
       glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr),
@@ -1758,7 +1874,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
    fs_reg dest = get_nir_dest(instr->dest);
    dest.type = this->result.type;
    unsigned num_components = nir_tex_instr_dest_size(instr);
-   emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, dest, this->result),
+   emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
+                             dest, this->result),
                 (1 << num_components) - 1);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
index d92d4bbd81d..b75f40ba5a1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
@@ -24,6 +24,8 @@
 #include "brw_fs.h"
 #include "brw_cfg.h"
 
+using namespace brw;
+
 /** @file brw_fs_peephole_predicated_break.cpp
  *
  * Loops are often structured as
@@ -85,9 +87,9 @@ fs_visitor::opt_peephole_predicated_break()
        * instruction to set the flag register.
        */
       if (devinfo->gen == 6 && if_inst->conditional_mod) {
-         bld.at(if_block, if_inst)
-            .CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
-                 if_inst->conditional_mod);
+         const fs_builder ibld(this, if_block, if_inst);
+         ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                  if_inst->conditional_mod);
          jump_inst->predicate = BRW_PREDICATE_NORMAL;
       } else {
          jump_inst->predicate = if_inst->predicate;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 364fc4a5ad2..b70895ec2ff 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -73,11 +73,20 @@ fs_visitor::assign_regs_trivial()
 }
 
 static void
-brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
+brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
 {
    const struct brw_device_info *devinfo = compiler->devinfo;
    int base_reg_count = BRW_MAX_GRF;
-   int index = reg_width - 1;
+   int index = (dispatch_width / 8) - 1;
+
+   if (dispatch_width > 8 && devinfo->gen >= 7) {
+      /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
+       * SIMD16.  Therefore, we can use the exact same register sets for
+       * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
+       */
+      compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
+      return;
+   }
 
    /* The registers used to make up almost all values handled in the compiler
     * are a scalar value occupying a single register (or 2 registers in the
@@ -121,7 +130,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
    /* Compute the total number of registers across all classes. */
    int ra_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
-      if (devinfo->gen <= 5 && reg_width == 2) {
+      if (devinfo->gen <= 5 && dispatch_width == 16) {
          /* From the G45 PRM:
           *
           * In order to reduce the hardware complexity, the following
@@ -168,7 +177,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
    int pairs_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
       int class_reg_count;
-      if (devinfo->gen <= 5 && reg_width == 2) {
+      if (devinfo->gen <= 5 && dispatch_width == 16) {
          class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
 
          /* See comment below.  The only difference here is that we are
@@ -214,7 +223,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
          pairs_reg_count = class_reg_count;
       }
 
-      if (devinfo->gen <= 5 && reg_width == 2) {
+      if (devinfo->gen <= 5 && dispatch_width == 16) {
          for (int j = 0; j < class_reg_count; j++) {
             ra_class_add_reg(regs, classes[i], reg);
 
@@ -249,7 +258,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
    /* Add a special class for aligned pairs, which we'll put delta_xy
     * in on Gen <= 6 so that we can do PLN.
     */
-   if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) {
+   if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) {
       aligned_pairs_class = ra_alloc_reg_class(regs);
 
       for (int i = 0; i < pairs_reg_count; i++) {
@@ -287,8 +296,8 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
 void
 brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
 {
-   brw_alloc_reg_set(compiler, 1);
-   brw_alloc_reg_set(compiler, 2);
+   brw_alloc_reg_set(compiler, 8);   /* first: fills fs_reg_sets[0] */
+   brw_alloc_reg_set(compiler, 16);  /* may copy fs_reg_sets[0] on gen7+ */
 }
 
 static int
@@ -341,7 +350,9 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
    int loop_end_ip = 0;
 
    int payload_last_use_ip[payload_node_count];
-   memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip));
+   for (int i = 0; i < payload_node_count; i++)
+      payload_last_use_ip[i] = -1;
+
    int ip = 0;
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       switch (inst->opcode) {
@@ -380,32 +391,15 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
             if (node_nr >= payload_node_count)
                continue;
 
-            payload_last_use_ip[node_nr] = use_ip;
+            for (int j = 0; j < inst->regs_read(i); j++) {
+               assert(node_nr + j < payload_node_count); /* before the store */
+               payload_last_use_ip[node_nr + j] = use_ip;
+            }
          }
       }
 
       /* Special case instructions which have extra implied registers used. */
       switch (inst->opcode) {
-      case FS_OPCODE_LINTERP:
-         /* On gen6+ in SIMD16, there are 4 adjacent registers used by
-          * PLN's sourcing of the deltas, while we list only the first one
-          * in the arguments.  Pre-gen6, the deltas are computed in normal
-          * VGRFs.
-          */
-         if (devinfo->gen >= 6) {
-            int delta_x_arg = 0;
-            if (inst->src[delta_x_arg].file == HW_REG &&
-                inst->src[delta_x_arg].fixed_hw_reg.file ==
-                BRW_GENERAL_REGISTER_FILE) {
-               for (int i = 1; i < 4; ++i) {
-                  int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i;
-                  assert(node < payload_node_count);
-                  payload_last_use_ip[node] = use_ip;
-               }
-            }
-         }
-         break;
-
       case CS_OPCODE_CS_TERMINATE:
          payload_last_use_ip[0] = use_ip;
          break;
@@ -428,6 +422,9 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
    }
 
    for (int i = 0; i < payload_node_count; i++) {
+      if (payload_last_use_ip[i] == -1)
+         continue;
+
       /* Mark the payload node as interfering with any virtual grf that is
        * live between the start of the program and our last use of the payload
        * node.
@@ -706,10 +703,8 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
                          uint32_t spill_offset, int count)
 {
    int reg_size = 1;
-   if (dispatch_width == 16 && count % 2 == 0) {
+   if (dispatch_width == 16 && count % 2 == 0)
       reg_size = 2;
-      dst.width = 16;
-   }
 
    const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
                               .group(reg_size * 8, 0)
@@ -752,7 +747,7 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
 
    for (int i = 0; i < count / reg_size; i++) {
       fs_inst *spill_inst =
-         ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src);
+         ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, ibld.null_reg_f(), src);
       src.reg_offset += reg_size;
       spill_inst->offset = spill_offset + i * reg_size * REG_SIZE;
       spill_inst->mlen = 1 + reg_size; /* header, value */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 2ad7079bdf8..72e873857ce 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -167,7 +167,6 @@ fs_visitor::register_coalesce()
          src_size = alloc.sizes[inst->src[0].reg];
          assert(src_size <= MAX_VGRF_SIZE);
 
-         assert(inst->src[0].width % 8 == 0);
          channels_remaining = src_size;
          memset(mov, 0, sizeof(mov));
 
@@ -196,7 +195,7 @@ fs_visitor::register_coalesce()
             continue;
          }
          reg_to_offset[offset] = inst->dst.reg_offset;
-         if (inst->src[0].width == 16)
+         if (inst->regs_written > 1)
             reg_to_offset[offset + 1] = inst->dst.reg_offset + 1;
          mov[offset] = inst;
          channels_remaining -= inst->regs_written;
@@ -229,7 +228,6 @@ fs_visitor::register_coalesce()
          continue;
 
       progress = true;
-      bool was_load_payload = inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD;
 
       for (int i = 0; i < src_size; i++) {
          if (mov[i]) {
@@ -243,22 +241,19 @@ fs_visitor::register_coalesce()
       }
 
       foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
-         for (int i = 0; i < src_size; i++) {
-            if (mov[i] || was_load_payload) {
-               if (scan_inst->dst.file == GRF &&
-                   scan_inst->dst.reg == reg_from &&
-                   scan_inst->dst.reg_offset == i) {
-                  scan_inst->dst.reg = reg_to;
-                  scan_inst->dst.reg_offset = reg_to_offset[i];
-               }
-               for (int j = 0; j < scan_inst->sources; j++) {
-                  if (scan_inst->src[j].file == GRF &&
-                      scan_inst->src[j].reg == reg_from &&
-                      scan_inst->src[j].reg_offset == i) {
-                     scan_inst->src[j].reg = reg_to;
-                     scan_inst->src[j].reg_offset = reg_to_offset[i];
-                  }
-               }
+         if (scan_inst->dst.file == GRF &&
+             scan_inst->dst.reg == reg_from) {
+            scan_inst->dst.reg = reg_to;
+            scan_inst->dst.reg_offset =
+               reg_to_offset[scan_inst->dst.reg_offset];
+         }
+
+         for (int j = 0; j < scan_inst->sources; j++) {
+            if (scan_inst->src[j].file == GRF &&
+                scan_inst->src[j].reg == reg_from) {
+               scan_inst->src[j].reg = reg_to;
+               scan_inst->src[j].reg_offset =
+                  reg_to_offset[scan_inst->src[j].reg_offset];
             }
          }
       }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
index 8660ec08b8f..d190d8eb6b4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
@@ -174,6 +174,9 @@ fs_visitor::opt_peephole_sel()
 
          /* Check that the MOVs are the right form. */
          if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
+             then_mov[i]->exec_size != else_mov[i]->exec_size ||
+             then_mov[i]->force_sechalf != else_mov[i]->force_sechalf ||
+             then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
              then_mov[i]->is_partial_write() ||
              else_mov[i]->is_partial_write() ||
              then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
@@ -192,14 +195,17 @@ fs_visitor::opt_peephole_sel()
       if (movs == 0)
          continue;
 
-      const fs_builder ibld = bld.at(block, if_inst);
-
       /* Emit a CMP if our IF used the embedded comparison */
-      if (devinfo->gen == 6 && if_inst->conditional_mod)
+      if (devinfo->gen == 6 && if_inst->conditional_mod) {
+         const fs_builder ibld(this, block, if_inst);
          ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1],
                   if_inst->conditional_mod);
+      }
 
       for (int i = 0; i < movs; i++) {
+         const fs_builder ibld = fs_builder(this, then_block, then_mov[i])
+                                 .at(block, if_inst);
+
          if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
             ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
          } else {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
new file mode 100644
index 00000000000..50e0acd05f5
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -0,0 +1,1096 @@
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs_surface_builder.h"
+#include "brw_fs.h"
+
+using namespace brw;
+
+namespace brw {
+   namespace surface_access {
+      namespace {
+         /**
+          * Generate a logical send opcode for a surface message and return
+          * the result.
+          */
+         fs_reg
+         emit_send(const fs_builder &bld, enum opcode opcode,
+                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
+                   unsigned dims, unsigned arg, unsigned rsize,
+                   brw_predicate pred = BRW_PREDICATE_NONE)
+         {
+            /* Reduce the dynamically uniform surface index to a single
+             * scalar.
+             */
+            const fs_reg usurface = bld.emit_uniformize(surface);
+            const fs_reg srcs[] = {
+               addr, src, usurface, fs_reg(dims), fs_reg(arg)
+            };
+            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
+            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+
+            inst->regs_written = rsize * bld.dispatch_width() / 8;
+            inst->predicate = pred;
+            return dst;
+         }
+      }
+
+      /**
+       * Emit an untyped surface read opcode.  \p dims determines the number
+       * of components of the address and \p size the number of components of
+       * the returned value.
+       */
+      fs_reg
+      emit_untyped_read(const fs_builder &bld,
+                        const fs_reg &surface, const fs_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred)
+      {
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+                          addr, fs_reg(), surface, dims, size, size, pred);
+      }
+
+      /**
+       * Emit an untyped surface write opcode.  \p dims determines the number
+       * of components of the address and \p size the number of components of
+       * the argument.
+       */
+      void
+      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
+                         const fs_reg &addr, const fs_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred)
+      {
+         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
+                   addr, src, surface, dims, size, 0, pred);
+      }
+
+      /**
+       * Emit an untyped surface atomic opcode.  \p dims determines the number
+       * of components of the address and \p rsize the number of components of
+       * the returned value (either zero or one).
+       */
+      fs_reg
+      emit_untyped_atomic(const fs_builder &bld,
+                          const fs_reg &surface, const fs_reg &addr,
+                          const fs_reg &src0, const fs_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred)
+      {
+         /* FINISHME: Factor out this frequently recurring pattern into a
+          * helper function.
+          */
+         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+         const fs_reg srcs[] = { src0, src1 };
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
+         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
+
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
+                          addr, tmp, surface, dims, op, rsize, pred);
+      }
+
+      /**
+       * Emit a typed surface read opcode.  \p dims determines the number of
+       * components of the address and \p size the number of components of the
+       * returned value.
+       */
+      fs_reg
+      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
+                      const fs_reg &addr, unsigned dims, unsigned size)
+      {
+         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
+                          addr, fs_reg(), surface, dims, size, size);
+      }
+
+      /**
+       * Emit a typed surface write opcode.  \p dims determines the number of
+       * components of the address and \p size the number of components of the
+       * argument.
+       */
+      void
+      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned dims, unsigned size)
+      {
+         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
+                   addr, src, surface, dims, size, 0);
+      }
+
+      /**
+       * Emit a typed surface atomic opcode.  \p dims determines the number of
+       * components of the address, \p rsize the number of components of the
+       * returned value (either zero or one) and \p pred the predication mode.
+       */
+      fs_reg
+      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
+                        const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred)
+      {
+         /* FINISHME: Factor out this frequently recurring pattern into a
+          * helper function.
+          */
+         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+         const fs_reg srcs[] = { src0, src1 };
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
+         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
+
+         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
+                          addr, tmp, surface, dims, op, rsize, pred);
+      }
+   }
+}
+
+namespace {
+   namespace image_format_info {
+      /**
+       * Simple 4-tuple of scalars used to pass around per-color component
+       * values.
+       */
+      struct color_u {
+         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
+         {
+         }
+
+         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
+            r(r), g(g), b(b), a(a)
+         {
+         }
+
+         unsigned
+         operator[](unsigned i) const
+         {
+            const unsigned xs[] = { r, g, b, a };
+            return xs[i];
+         }
+
+         unsigned r, g, b, a;
+      };
+
+      /**
+       * Return the per-channel bitfield widths for a given image format.
+       */
+      inline color_u
+      get_bit_widths(mesa_format format)
+      {
+         return color_u(_mesa_get_format_bits(format, GL_RED_BITS),
+                        _mesa_get_format_bits(format, GL_GREEN_BITS),
+                        _mesa_get_format_bits(format, GL_BLUE_BITS),
+                        _mesa_get_format_bits(format, GL_ALPHA_BITS));
+      }
+
+      /**
+       * Return the per-channel bitfield shifts for a given image format.
+       */
+      inline color_u
+      get_bit_shifts(mesa_format format)
+      {
+         const color_u widths = get_bit_widths(format);
+         return color_u(0, widths.r, widths.r + widths.g,
+                        widths.r + widths.g + widths.b);
+      }
+
+      /**
+       * Return true if all present components have the same bit width.
+       */
+      inline bool
+      is_homogeneous(mesa_format format)
+      {
+         const color_u widths = get_bit_widths(format);
+         return ((widths.g == 0 || widths.g == widths.r) &&
+                 (widths.b == 0 || widths.b == widths.r) &&
+                 (widths.a == 0 || widths.a == widths.r));
+      }
+
+      /**
+       * Return true if the format conversion boils down to a trivial copy.
+       */
+      inline bool
+      is_conversion_trivial(const brw_device_info *devinfo, mesa_format format)
+      {
+         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
+                 format == brw_lower_mesa_image_format(devinfo, format);
+      }
+
+      /**
+       * Return true if the hardware natively supports some format with
+       * compatible bitfield layout, but possibly different data types.
+       */
+      inline bool
+      has_supported_bit_layout(const brw_device_info *devinfo,
+                               mesa_format format)
+      {
+         const color_u widths = get_bit_widths(format);
+         const color_u lower_widths = get_bit_widths(
+            brw_lower_mesa_image_format(devinfo, format));
+
+         return (widths.r == lower_widths.r &&
+                 widths.g == lower_widths.g &&
+                 widths.b == lower_widths.b &&
+                 widths.a == lower_widths.a);
+      }
+
+      /**
+       * Return true if we are required to spread individual components over
+       * several components of the format used by the hardware (RG32 and
+       * friends implemented as RGBA16UI).
+       */
+      inline bool
+      has_split_bit_layout(const brw_device_info *devinfo, mesa_format format)
+      {
+         const mesa_format lower_format =
+            brw_lower_mesa_image_format(devinfo, format);
+
+         return (_mesa_format_num_components(format) <
+                 _mesa_format_num_components(lower_format));
+      }
+
+      /**
+       * Return true if typed surface access can be used for a format of this
+       * size on this hardware rather than falling back to untyped access.
+       */
+      inline bool
+      has_matching_typed_format(const brw_device_info *devinfo,
+                                mesa_format format)
+      {
+         return (_mesa_get_format_bytes(format) <= 4 ||
+                 (_mesa_get_format_bytes(format) <= 8 &&
+                  (devinfo->gen >= 8 || devinfo->is_haswell)) ||
+                 devinfo->gen >= 9);
+      }
+
+      /**
+       * Return true if the hardware returns garbage in the unused high bits
+       * of each component.  This may happen on IVB because we rely on the
+       * undocumented behavior that typed reads from surfaces of the
+       * unsupported R8 and R16 formats return useful data in their least
+       * significant bits.
+       */
+      inline bool
+      has_undefined_high_bits(const brw_device_info *devinfo,
+                              mesa_format format)
+      {
+         const mesa_format lower_format =
+            brw_lower_mesa_image_format(devinfo, format);
+
+         return (devinfo->gen == 7 && !devinfo->is_haswell &&
+                 (lower_format == MESA_FORMAT_R_UINT16 ||
+                  lower_format == MESA_FORMAT_R_UINT8));
+      }
+
+      /**
+       * Return true if the format represents values as signed integers
+       * requiring sign extension when unpacking.
+       */
+      inline bool
+      needs_sign_extension(mesa_format format)
+      {
+         return (_mesa_get_format_datatype(format) == GL_SIGNED_NORMALIZED ||
+                 _mesa_get_format_datatype(format) == GL_INT);
+      }
+   }
+
+   namespace image_validity {
+      /**
+       * Check whether there is an image bound at the given index and write
+       * the comparison result to f0.0.  Returns an appropriate predication
+       * mode to use on subsequent image operations.
+       */
+      brw_predicate
+      emit_surface_check(const fs_builder &bld, const fs_reg &image)
+      {
+         const brw_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+         if (devinfo->gen == 7 && !devinfo->is_haswell) {
+            /* Check the first component of the size field to find out if the
+             * image is bound.  Necessary on IVB for typed atomics because
+             * they don't seem to respect null surfaces and will happily
+             * corrupt or read random memory when no image is bound.
+             */
+            bld.CMP(bld.null_reg_ud(),
+                    retype(size, BRW_REGISTER_TYPE_UD),
+                    fs_reg(0), BRW_CONDITIONAL_NZ);
+
+            return BRW_PREDICATE_NORMAL;
+         } else {
+            /* More recent platforms implement compliant behavior when a null
+             * surface is bound.
+             */
+            return BRW_PREDICATE_NONE;
+         }
+      }
+
+      /**
+       * Check whether the provided coordinates are within the image bounds
+       * and write the comparison result to f0.0.  Returns an appropriate
+       * predication mode to use on subsequent image operations.
+       */
+      brw_predicate
+      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
+                        const fs_reg &addr, unsigned dims)
+      {
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+         for (unsigned c = 0; c < dims; ++c)
+            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
+                          bld.CMP(bld.null_reg_ud(),
+                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
+                                  offset(size, bld, c),
+                                  BRW_CONDITIONAL_L));
+
+         return BRW_PREDICATE_NORMAL;
+      }
+   }
+
+   namespace image_coordinates {
+      /**
+       * Return the total number of coordinates needed to address a texel of
+       * the surface, which may be more than the sum of \p surf_dims and \p
+       * arr_dims if padding is required.
+       */
+      unsigned
+      num_image_coordinates(const fs_builder &bld,
+                            unsigned surf_dims, unsigned arr_dims,
+                            mesa_format format)
+      {
+         /* HSW in vec4 mode and our software coordinate handling for untyped
+          * reads want the array index to be at the Z component.
+          */
+         const bool array_index_at_z =
+            !image_format_info::has_matching_typed_format(
+               bld.shader->devinfo, format);
+         const unsigned zero_dims =
+            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
+
+         return surf_dims + zero_dims + arr_dims;
+      }
+
+      /**
+       * Transform image coordinates into the form expected by the
+       * implementation.
+       */
+      fs_reg
+      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
+                             unsigned surf_dims, unsigned arr_dims,
+                             mesa_format format)
+      {
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (dims > surf_dims + arr_dims) {
+            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
+            /* The array index is required to be passed in as the Z component,
+             * insert a zero at the Y component to shift it to the right
+             * position.
+             *
+             * FINISHME: Factor out this frequently recurring pattern into a
+             * helper function.
+             */
+            const fs_reg srcs[] = { addr, fs_reg(0), offset(addr, bld, 1) };
+            const fs_reg dst = bld.vgrf(addr.type, dims);
+            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
+            return dst;
+         } else {
+            return addr;
+         }
+      }
+
+      /**
+       * Calculate the offset in memory of the texel given by \p coord.
+       *
+       * This is meant to be used with untyped surface messages to access a
+       * tiled surface, which involves taking into account the tiling and
+       * swizzling modes of the surface manually so it will hopefully not
+       * happen very often.
+       *
+       * The tiling algorithm implemented here matches either the X or Y
+       * tiling layouts supported by the hardware depending on the tiling
+       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
+       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
+       * explanation of the hardware tiling format.
+       */
+      fs_reg
+      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
+                               const fs_reg &coord, unsigned dims)
+      {
+         const brw_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
+         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
+         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
+         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
+         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+         /* Shift the coordinates by the fixed surface offset.  It may be
+          * non-zero if the image is a single slice of a higher-dimensional
+          * surface, or if a non-zero mipmap level of the surface is bound to
+          * the pipeline.  The offset needs to be applied here rather than at
+          * surface state set-up time because the desired slice-level may
+          * start mid-tile, so simply shifting the surface base address
+          * wouldn't give a well-formed tiled surface in the general case.
+          */
+         for (unsigned c = 0; c < 2; ++c)
+            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
+                    (c < dims ?
+                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
+                     fs_reg(0)));
+
+         /* The layout of 3-D textures in memory is sort-of like a tiling
+          * format.  At each miplevel, the slices are arranged in rows of
+          * 2^level slices per row.  The slice row is stored in tmp.y and
+          * the slice within the row is stored in tmp.x.
+          *
+          * The layout of 2-D array textures and cubemaps is much simpler:
+          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
+          * stored in memory as an array of slices, each one being a 2-D
+          * arrangement of miplevels, or as a 2D arrangement of miplevels,
+          * each one being an array of slices.  In either case the separation
+          * between slices of the same LOD is equal to the qpitch value
+          * provided as stride.w.
+          *
+          * This code can be made to handle both 2D arrays and 3D textures
+          * by passing in the miplevel as tile.z for 3-D textures and 0 in
+          * tile.z for 2-D array textures.
+          *
+          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
+          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
+          * of the hardware 3D texture and 2D array layouts.
+          */
+         if (dims > 2) {
+            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
+             * index.
+             */
+            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), fs_reg(0),
+                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
+            bld.SHR(offset(tmp, bld, 1),
+                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
+                    offset(tile, bld, 2));
+
+            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
+             * slice offset.
+             */
+            for (unsigned c = 0; c < 2; ++c) {
+               bld.MUL(offset(tmp, bld, c),
+                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
+               bld.ADD(offset(addr, bld, c),
+                       offset(addr, bld, c), offset(tmp, bld, c));
+            }
+         }
+
+         if (dims > 1) {
+            /* Calculate the major/minor x and y indices.  In order to
+             * accommodate both X and Y tiling, the Y-major tiling format is
+             * treated as being a bunch of narrow X-tiles placed next to each
+             * other.  This means that the tile width for Y-tiling is actually
+             * the width of one sub-column of the Y-major tile where each 4K
+             * tile has 8 512B sub-columns.
+             *
+             * The major Y value is the row of tiles in which the pixel lives.
+             * The major X value is the tile sub-column in which the pixel
+             * lives; for X tiling, this is the same as the tile column, for Y
+             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
+             * are the position within the sub-column.
+             */
+            for (unsigned c = 0; c < 2; ++c) {
+               /* Calculate the minor x and y indices. */
+               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
+                       fs_reg(0), offset(addr, bld, c));
+
+               /* Calculate the major x and y indices. */
+               bld.SHR(offset(major, bld, c),
+                       offset(addr, bld, c), offset(tile, bld, c));
+            }
+
+            /* Calculate the texel index from the start of the tile row and
+             * the vertical coordinate of the row.
+             * Equivalent to:
+             *   tmp.x = (major.x << tile.y << tile.x) +
+             *           (minor.y << tile.x) + minor.x
+             *   tmp.y = major.y << tile.y
+             */
+            bld.SHL(tmp, major, offset(tile, bld, 1));
+            bld.ADD(tmp, tmp, offset(minor, bld, 1));
+            bld.SHL(tmp, tmp, offset(tile, bld, 0));
+            bld.ADD(tmp, tmp, minor);
+            bld.SHL(offset(tmp, bld, 1),
+                    offset(major, bld, 1), offset(tile, bld, 1));
+
+            /* Add it to the start of the tile row. */
+            bld.MUL(offset(tmp, bld, 1),
+                    offset(tmp, bld, 1), offset(stride, bld, 1));
+            bld.ADD(tmp, tmp, offset(tmp, bld, 1));
+
+            /* Multiply by the Bpp value. */
+            bld.MUL(dst, tmp, stride);
+
+            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
+               /* Take into account the two dynamically specified shifts.
+                * Both are needed to implement swizzling of X-tiled
+                * surfaces.  For Y-tiled surfaces only one bit needs to be
+                * XOR-ed with bit 6 of the memory address, so a swz value of
+                * 0xff (actually interpreted as 31 by the hardware) will be
+                * provided to cause the relevant bit of tmp.y to be zero and
+                * turn the first XOR into the identity.  For linear surfaces
+                * or platforms lacking address swizzling both shifts will be
+                * 0xff causing the relevant bits of both tmp.x and .y to be
+                * zero, which effectively disables swizzling.
+                */
+               for (unsigned c = 0; c < 2; ++c)
+                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
+
+               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
+               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
+               bld.AND(tmp, tmp, fs_reg(1 << 6));
+               bld.XOR(dst, dst, tmp);
+            }
+
+         } else {
+            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
+             * non-zero even if the image is one-dimensional because a
+             * vertical offset may have been applied above to select a
+             * non-zero slice or level of a higher-dimensional texture.
+             */
+            bld.MUL(offset(addr, bld, 1),
+                    offset(addr, bld, 1), offset(stride, bld, 1));
+            bld.ADD(addr, addr, offset(addr, bld, 1));
+            bld.MUL(dst, addr, stride);
+         }
+
+         return dst;
+      }
+   }
+
+   namespace image_format_conversion {
+      using image_format_info::color_u;
+
+      namespace {
+         /**
+          * Maximum representable value in an unsigned integer with the given
+          * number of bits.  Saturates to all-ones for \p n >= 32.
+          */
+         inline unsigned
+         scale(unsigned n)
+         {
+            return (n >= 32 ? ~0u : (1u << n) - 1);
+         }
+      }
+
+      /**
+       * Pack the vector \p src in a bitfield given the per-component bit
+       * shifts and widths.  Note that bitfield components are not allowed to
+       * cross 32-bit boundaries.
+       */
+      fs_reg
+      emit_pack(const fs_builder &bld, const fs_reg &src,
+                const color_u &shifts, const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         bool seen[4] = {};
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+               /* Shift each component left to the correct bitfield position. */
+               bld.SHL(tmp, offset(src, bld, c), fs_reg(shifts[c] % 32));
+
+               /* Add everything up. */
+               if (seen[shifts[c] / 32]) {
+                  bld.OR(offset(dst, bld, shifts[c] / 32),
+                         offset(dst, bld, shifts[c] / 32), tmp);
+               } else {
+                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
+                  seen[shifts[c] / 32] = true;
+               }
+            }
+         }
+
+         return dst;
+      }
+
+      /**
+       * Unpack a vector from the bitfield \p src given the per-component bit
+       * shifts and widths.  Note that bitfield components are not allowed to
+       * cross 32-bit boundaries.
+       */
+      fs_reg
+      emit_unpack(const fs_builder &bld, const fs_reg &src,
+                  const color_u &shifts, const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(src.type, 4);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (widths[c]) {
+               /* Shift left to discard the most significant bits. */
+               bld.SHL(offset(dst, bld, c),
+                       offset(src, bld, shifts[c] / 32),
+                       fs_reg(32 - shifts[c] % 32 - widths[c]));
+
+               /* Shift back to the least significant bits using an arithmetic
+                * shift to get sign extension on signed types.
+                */
+               bld.ASR(offset(dst, bld, c),
+                       offset(dst, bld, c), fs_reg(32 - widths[c]));
+            }
+         }
+
+         return dst;
+      }
+
+      /**
+       * Convert an integer vector into another integer vector of the
+       * specified bit widths, properly handling overflow.
+       */
+      fs_reg
+      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
+                              const color_u &widths, bool is_signed)
+      {
+         const unsigned sbits = (is_signed ? 1 : 0);
+         const fs_reg dst = bld.vgrf(
+            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+         assert(src.type == dst.type);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            /* Clamp from above, and for signed formats also from below. */
+            bld.emit_minmax(offset(dst, bld, i), offset(src, bld, i),
+                            fs_reg((int)scale(widths[i] - sbits)),
+                            BRW_CONDITIONAL_L);
+
+            if (is_signed)
+               bld.emit_minmax(offset(dst, bld, i), offset(dst, bld, i),
+                               fs_reg(-(int)scale(widths[i] - sbits) - 1),
+                               BRW_CONDITIONAL_G);
+         }
+
+         return dst;
+      }
+
+      /**
+       * Convert a normalized fixed-point vector of the specified signedness
+       * and bit widths into a floating point vector.
+       */
+      fs_reg
+      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
+                               const color_u &widths, bool is_signed)
+      {
+         const unsigned sbits = (is_signed ? 1 : 0);
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            /* Moving into the float destination does the conversion. */
+            bld.MOV(offset(dst, bld, i), offset(src, bld, i));
+
+            /* Normalize by multiplying with the reciprocal of the scale. */
+            bld.MUL(offset(dst, bld, i), offset(dst, bld, i),
+                    fs_reg(1.0f / scale(widths[i] - sbits)));
+
+            /* Clamp signed results to the minimum value of -1. */
+            if (is_signed)
+               bld.emit_minmax(offset(dst, bld, i), offset(dst, bld, i),
+                               fs_reg(-1.0f), BRW_CONDITIONAL_G);
+         }
+         return dst;
+      }
+
+      /**
+       * Convert a floating-point vector into a normalized fixed-point vector
+       * of the specified signedness and bit widths.
+       */
+      fs_reg
+      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
+                             const color_u &widths, bool is_signed)
+      {
+         const unsigned sbits = (is_signed ? 1 : 0);
+         const fs_reg dst = bld.vgrf(
+            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            /* Clamp the normalized floating-point argument first. */
+            if (!is_signed) {
+               set_saturate(true, bld.MOV(offset(fdst, bld, i),
+                                          offset(src, bld, i)));
+            } else {
+               bld.emit_minmax(offset(fdst, bld, i), offset(src, bld, i),
+                               fs_reg(-1.0f), BRW_CONDITIONAL_G);
+               bld.emit_minmax(offset(fdst, bld, i), offset(fdst, bld, i),
+                               fs_reg(1.0f), BRW_CONDITIONAL_L);
+            }
+
+            /* Scale by the normalization constant of the component. */
+            bld.MUL(offset(fdst, bld, i), offset(fdst, bld, i),
+                    fs_reg((float)scale(widths[i] - sbits)));
+
+            /* Round with RNDE, then convert to integer with a MOV. */
+            bld.RNDE(offset(fdst, bld, i), offset(fdst, bld, i));
+            bld.MOV(offset(dst, bld, i), offset(fdst, bld, i));
+         }
+
+         return dst;
+      }
+
+      /**
+       * Convert a floating point vector of the specified bit widths into a
+       * 32-bit floating point vector.
+       */
+      fs_reg
+      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
+                              const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            bld.MOV(offset(dst, bld, i), offset(src, bld, i));
+
+            /* 10-bit and 11-bit floats have a 5-bit exponent like the 16-bit
+             * format and no sign bit, so shifting up to 15 bits widens them.
+             */
+            if (widths[i] < 16)
+               bld.SHL(offset(dst, bld, i), offset(dst, bld, i),
+                       fs_reg(15 - widths[i]));
+
+            /* Expand the 16-bit float to 32 bits. */
+            bld.F16TO32(offset(fdst, bld, i), offset(dst, bld, i));
+         }
+
+         return fdst;
+      }
+
+      /**
+       * Convert a vector into a floating point vector of the specified bit
+       * widths.
+       */
+      fs_reg
+      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
+                            const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            bld.MOV(offset(fdst, bld, i), offset(src, bld, i));
+
+            /* Narrow formats have no sign bit, so clamp negatives to 0. */
+            if (widths[i] < 16)
+               bld.emit_minmax(offset(fdst, bld, i), offset(fdst, bld, i),
+                               fs_reg(0.0f), BRW_CONDITIONAL_G);
+
+            /* Pack down to a 16-bit float. */
+            bld.F32TO16(offset(dst, bld, i), offset(fdst, bld, i));
+
+            /* Drop the low-order bits to reach the requested width.  The
+             * 10-bit and 11-bit formats have the same 5-bit exponent as the
+             * 16-bit format and no sign bit, so the remaining bits are
+             * already laid out as required.
+             */
+            if (widths[i] < 16)
+               bld.SHR(offset(dst, bld, i), offset(dst, bld, i),
+                       fs_reg(15 - widths[i]));
+         }
+
+         return dst;
+      }
+
+      /** Replace zero-width components of \p src with 0, 0, 0, 1. */
+      fs_reg
+      emit_pad(const fs_builder &bld, const fs_reg &src, const color_u &widths)
+      {
+         const fs_reg dst = bld.vgrf(src.type, 4);
+         const unsigned fill[] = { 0, 0, 0, 1 };
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (widths[i])
+               bld.MOV(offset(dst, bld, i), offset(src, bld, i));
+            else
+               bld.MOV(offset(dst, bld, i), fs_reg(fill[i]));
+         }
+
+         return dst;
+      }
+   }
+}
+
+namespace brw {
+   namespace image_access {
+      /**
+       * Load a vector from a surface of the given format and dimensionality
+       * at the given coordinates.  \p surf_dims and \p arr_dims give the
+       * number of non-array and array coordinates of the image respectively.
+       */
+      fs_reg
+      emit_image_load(const fs_builder &bld,
+                      const fs_reg &image, const fs_reg &addr,
+                      unsigned surf_dims, unsigned arr_dims,
+                      mesa_format format)
+      {
+         using namespace image_format_info;
+         using namespace image_format_conversion;
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         const brw_device_info *devinfo = bld.shader->devinfo;
+         const mesa_format lower_format =
+            brw_lower_mesa_image_format(devinfo, format);
+         fs_reg tmp;
+
+         /* Transform the image coordinates into actual surface coordinates. */
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (has_matching_typed_format(devinfo, format)) {
+            /* Hopefully we get here most of the time... */
+            tmp = emit_typed_read(bld, image, saddr, dims,
+                                  _mesa_format_num_components(lower_format));
+         } else {
+            /* Untyped surface reads return 32 bits of the surface per
+             * component, without any sort of unpacking or type conversion,
+             */
+            const unsigned size = _mesa_get_format_bytes(format) / 4;
+
+            /* they don't properly handle out of bounds access, so we have to
+             * check manually if the coordinates are valid and predicate the
+             * surface read on the result,
+             */
+            const brw_predicate pred =
+               emit_bounds_check(bld, image, saddr, dims);
+
+            /* and they don't know about surface coordinates, we need to
+             * convert them to a raw memory offset.
+             */
+            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
+
+            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
+
+            /* Out of bounds reads give zero; tmp has only size components. */
+            for (unsigned c = 0; c < size; ++c)
+               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
+                                           offset(tmp, bld, c), fs_reg(0)));
+         }
+
+         /* Set the register type to D instead of UD if the data type is
+          * represented as a signed integer in memory so that sign extension
+          * is handled correctly by unpack.
+          */
+         if (needs_sign_extension(format))
+            tmp = retype(tmp, BRW_REGISTER_TYPE_D);
+
+         if (!has_supported_bit_layout(devinfo, format)) {
+            /* Unpack individual vector components from the bitfield if the
+             * hardware is unable to do it for us.
+             */
+            if (has_split_bit_layout(devinfo, format))
+               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
+                               get_bit_widths(lower_format));
+            else
+               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
+                                 get_bit_widths(format));
+
+         } else if ((needs_sign_extension(format) &&
+                     !is_conversion_trivial(devinfo, format)) ||
+                    has_undefined_high_bits(devinfo, format)) {
+            /* Perform a trivial unpack even though the bit layout matches in
+             * order to get the most significant bits of each component
+             * initialized properly.
+             */
+            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
+                              get_bit_widths(format));
+         }
+
+         if (!_mesa_is_format_integer(format)) {
+            if (is_conversion_trivial(devinfo, format)) {
+               /* Just need to cast the vector to the target type. */
+               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
+            } else {
+               /* Do the right sort of type conversion to float. */
+               if (_mesa_get_format_datatype(format) == GL_FLOAT)
+                  tmp = emit_convert_from_float(
+                     bld, tmp, get_bit_widths(format));
+               else
+                  tmp = emit_convert_from_scaled(
+                     bld, tmp, get_bit_widths(format),
+                     _mesa_is_format_signed(format));
+            }
+         }
+
+         /* Initialize missing components of the result. */
+         return emit_pad(bld, tmp, get_bit_widths(format));
+      }
+
+      /**
+       * Store a vector in a surface of the given format and dimensionality at
+       * the given coordinates.  \p surf_dims and \p arr_dims give the number
+       * of non-array and array coordinates of the image respectively.
+       */
+      void
+      emit_image_store(const fs_builder &bld, const fs_reg &image,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned surf_dims, unsigned arr_dims,
+                       mesa_format format)
+      {
+         using namespace image_format_info;
+         using namespace image_format_conversion;
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         const brw_device_info *devinfo = bld.shader->devinfo;
+
+         /* Map the image coordinates onto coordinates of the surface. */
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (format == MESA_FORMAT_NONE) {
+            /* An unknown format implies write-only access, which is fine
+             * because typed surface writes are always able to take care of
+             * type conversion and packing for us.  Nothing else to do.
+             */
+            emit_typed_write(bld, image, saddr, src, dims, 4);
+            return;
+         }
+
+         const mesa_format lower_format =
+            brw_lower_mesa_image_format(devinfo, format);
+         fs_reg tmp = src;
+
+         if (!is_conversion_trivial(devinfo, format)) {
+            /* Convert according to the data type of the format. */
+            if (_mesa_get_format_datatype(format) == GL_FLOAT)
+               tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
+
+            else if (_mesa_is_format_integer(format))
+               tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
+                                             _mesa_is_format_signed(format));
+
+            else
+               tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
+                                            _mesa_is_format_signed(format));
+         }
+
+         /* From here on it is all bit manipulation. */
+         tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
+
+         if (!has_supported_bit_layout(devinfo, format)) {
+            /* The hardware is unable to produce this bit layout for us, so
+             * the vector components have to be rearranged manually.
+             */
+            if (has_split_bit_layout(devinfo, format))
+               tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
+                                 get_bit_widths(lower_format));
+
+            else
+               tmp = emit_pack(bld, tmp, get_bit_shifts(format),
+                               get_bit_widths(format));
+         }
+
+         if (has_matching_typed_format(devinfo, format)) {
+            /* The common case, hopefully. */
+            emit_typed_write(bld, image, saddr, tmp, dims,
+                             _mesa_format_num_components(lower_format));
+
+         } else {
+            /* Untyped surface writes store 32 bits of the surface per
+             * component and do no packing or type conversion whatsoever,
+             */
+            const unsigned size = _mesa_get_format_bytes(format) / 4;
+
+            /* they do not handle out of bounds access properly, so the
+             * coordinates are validated by hand and the surface write is
+             * predicated on the result,
+             */
+            const brw_predicate pred =
+               emit_bounds_check(bld, image, saddr, dims);
+
+            /* and they do not know about surface coordinates, so those
+             * have to be converted to a raw memory offset.
+             */
+            const fs_reg laddr = emit_address_calculation(
+               bld, image, saddr, dims);
+
+            emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
+         }
+      }
+
+      /**
+       * Perform an atomic read-modify-write operation in a surface of the
+       * given dimensionality at the given coordinates.  \p surf_dims and \p
+       * arr_dims give the number of non-array and array coordinates of the
+       * image respectively.  Main building block of the imageAtomic GLSL
+       * built-ins.
+       */
+      fs_reg
+      emit_image_atomic(const fs_builder &bld,
+                        const fs_reg &image, const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned surf_dims, unsigned arr_dims,
+                        unsigned rsize, unsigned op)
+      {
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+
+         /* The atomic must not be executed on an unbound surface. */
+         const brw_predicate bound = emit_surface_check(bld, image);
+
+         /* Map the image coordinates onto coordinates of the surface. */
+         const mesa_format format = MESA_FORMAT_R_UINT32;
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         /* Typed atomics are sufficient, no need for the untyped kind. */
+         const fs_reg result = emit_typed_atomic(bld, image, saddr, src0, src1,
+                                                 dims, rsize, op, bound);
+
+         /* An access to an unbound surface should give zero as result. */
+         if (rsize)
+            set_predicate(bound, bld.SEL(result, result, fs_reg(0)));
+
+         return result;
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h
new file mode 100644
index 00000000000..a3dd839955b
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.h
@@ -0,0 +1,89 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_SURFACE_BUILDER_H
+#define BRW_FS_SURFACE_BUILDER_H
+
+#include "brw_fs_builder.h"
+#include "brw_context.h"
+
+namespace brw {
+   namespace surface_access {
+      fs_reg
+      emit_untyped_read(const fs_builder &bld,
+                        const fs_reg &surface, const fs_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+
+      void
+      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
+                         const fs_reg &addr, const fs_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred = BRW_PREDICATE_NONE);
+
+      fs_reg
+      emit_untyped_atomic(const fs_builder &bld,
+                          const fs_reg &surface, const fs_reg &addr,
+                          const fs_reg &src0, const fs_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred = BRW_PREDICATE_NONE);
+
+      fs_reg
+      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
+                      const fs_reg &addr, unsigned dims, unsigned size);
+
+      void
+      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned dims, unsigned size);
+
+      fs_reg
+      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
+                        const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+   } /* namespace surface_access */
+
+   namespace image_access {
+      fs_reg
+      emit_image_load(const fs_builder &bld,
+                      const fs_reg &image, const fs_reg &addr,
+                      unsigned surf_dims, unsigned arr_dims,
+                      mesa_format format);
+
+      void
+      emit_image_store(const fs_builder &bld, const fs_reg &image,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned surf_dims, unsigned arr_dims,
+                       mesa_format format);
+      fs_reg
+      emit_image_atomic(const fs_builder &bld,
+                        const fs_reg &image, const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned surf_dims, unsigned arr_dims,
+                        unsigned rsize, unsigned op);
+   } /* namespace image_access */
+} /* namespace brw */
+#endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
index 01d3a569858..96d4f375da2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp
@@ -173,7 +173,7 @@ ir_vector_reference_visitor::visit_enter(ir_assignment *ir)
       return visit_continue_with_parent;
    }
    if (ir->lhs->as_dereference_variable() &&
-       is_power_of_two(ir->write_mask) &&
+       _mesa_is_pow_two(ir->write_mask) &&
        !ir->condition) {
       /* If we're writing just a channel, then channel-splitting the LHS is OK.
        */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 9a4bad6bcf5..111db8c4323 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -77,612 +77,6 @@ fs_visitor::emit_vs_system_value(int location)
    return reg;
 }
 
-fs_inst *
-fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_c,
-                              fs_reg lod, fs_reg dPdy, int grad_components,
-                              uint32_t sampler)
-{
-   int mlen;
-   int base_mrf = 1;
-   bool simd16 = false;
-   fs_reg orig_dst;
-
-   /* g0 header. */
-   mlen = 1;
-
-   if (shadow_c.file != BAD_FILE) {
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, 1);
-      }
-
-      /* gen4's SIMD8 sampler always has the slots for u,v,r present.
-       * the unused slots must be zeroed.
-       */
-      for (int i = coord_components; i < 3; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
-      }
-      mlen += 3;
-
-      if (op == ir_tex) {
-	 /* There's no plain shadow compare message, so we use shadow
-	  * compare with a bias of 0.0.
-	  */
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
-	 mlen++;
-      } else if (op == ir_txb || op == ir_txl) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), lod);
-	 mlen++;
-      } else {
-         unreachable("Should not get here.");
-      }
-
-      bld.MOV(fs_reg(MRF, base_mrf + mlen), shadow_c);
-      mlen++;
-   } else if (op == ir_tex) {
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, 1);
-      }
-      /* zero the others. */
-      for (int i = coord_components; i<3; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
-      }
-      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
-      mlen += 3;
-   } else if (op == ir_txd) {
-      fs_reg &dPdx = lod;
-
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
-	 coordinate = offset(coordinate, 1);
-      }
-      /* the slots for u and v are always present, but r is optional */
-      mlen += MAX2(coord_components, 2);
-
-      /*  P   = u, v, r
-       * dPdx = dudx, dvdx, drdx
-       * dPdy = dudy, dvdy, drdy
-       *
-       * 1-arg: Does not exist.
-       *
-       * 2-arg: dudx   dvdx   dudy   dvdy
-       *        dPdx.x dPdx.y dPdy.x dPdy.y
-       *        m4     m5     m6     m7
-       *
-       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
-       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
-       *        m5     m6     m7     m8     m9     m10
-       */
-      for (int i = 0; i < grad_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx);
-	 dPdx = offset(dPdx, 1);
-      }
-      mlen += MAX2(grad_components, 2);
-
-      for (int i = 0; i < grad_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdy);
-	 dPdy = offset(dPdy, 1);
-      }
-      mlen += MAX2(grad_components, 2);
-   } else if (op == ir_txs) {
-      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
-      simd16 = true;
-      bld.MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
-      mlen += 2;
-   } else {
-      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
-       * instructions.  We'll need to do SIMD16 here.
-       */
-      simd16 = true;
-      assert(op == ir_txb || op == ir_txl || op == ir_txf);
-
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
-                 coordinate);
-	 coordinate = offset(coordinate, 1);
-      }
-
-      /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
-       * be necessary for TXF (ld), but seems wise to do for all messages.
-       */
-      for (int i = coord_components; i < 3; i++) {
-         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
-      }
-
-      /* lod/bias appears after u/v/r. */
-      mlen += 6;
-
-      bld.MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod);
-      mlen++;
-
-      /* The unused upper half. */
-      mlen++;
-   }
-
-   if (simd16) {
-      /* Now, since we're doing simd16, the return is 2 interleaved
-       * vec4s where the odd-indexed ones are junk. We'll need to move
-       * this weirdness around to the expected layout.
-       */
-      orig_dst = dst;
-      dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
-   }
-
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
-   case ir_txb: opcode = FS_OPCODE_TXB; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   default:
-      unreachable("not reached");
-   }
-
-   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
-   inst->base_mrf = base_mrf;
-   inst->mlen = mlen;
-   inst->header_size = 1;
-   inst->regs_written = simd16 ? 8 : 4;
-
-   if (simd16) {
-      for (int i = 0; i < 4; i++) {
-         bld.MOV(orig_dst, dst);
-	 orig_dst = offset(orig_dst, 1);
-	 dst = offset(dst, 2);
-      }
-   }
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
-                                     fs_reg coordinate, int vector_elements,
-                                     fs_reg shadow_c, fs_reg lod,
-                                     uint32_t sampler)
-{
-   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
-   bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf;
-
-   if (has_lod && shadow_c.file != BAD_FILE)
-      no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
-
-   if (op == ir_txd)
-      no16("textureGrad unsupported in SIMD16.");
-
-   /* Copy the coordinates. */
-   for (int i = 0; i < vector_elements; i++) {
-      bld.MOV(retype(offset(message, i), coordinate.type), coordinate);
-      coordinate = offset(coordinate, 1);
-   }
-
-   fs_reg msg_end = offset(message, vector_elements);
-
-   /* Messages other than sample and ld require all three components */
-   if (has_lod || shadow_c.file != BAD_FILE) {
-      for (int i = vector_elements; i < 3; i++) {
-         bld.MOV(offset(message, i), fs_reg(0.0f));
-      }
-   }
-
-   if (has_lod) {
-      fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
-                              BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
-   }
-
-   if (shadow_c.file != BAD_FILE) {
-      fs_reg msg_ref = offset(message, 3 + has_lod);
-      bld.MOV(msg_ref, shadow_c);
-      msg_end = offset(msg_ref, 1);
-   }
-
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
-   case ir_txb: opcode = FS_OPCODE_TXB;     break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   default: unreachable("not reached");
-   }
-
-   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
-   inst->base_mrf = message.reg - 1;
-   inst->mlen = msg_end.reg - inst->base_mrf;
-   inst->header_size = 1;
-   inst->regs_written = 8;
-
-   return inst;
-}
-
-/* gen5's sampler has slots for u, v, r, array index, then optional
- * parameters like shadow comparitor or LOD bias.  If optional
- * parameters aren't present, those base slots are optional and don't
- * need to be included in the message.
- *
- * We don't fill in the unnecessary slots regardless, which may look
- * surprising in the disassembly.
- */
-fs_inst *
-fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int vector_elements,
-                              fs_reg shadow_c,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, uint32_t sampler,
-                              bool has_offset)
-{
-   int reg_width = dispatch_width / 8;
-   unsigned header_size = 0;
-
-   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
-   fs_reg msg_coords = message;
-
-   if (has_offset) {
-      /* The offsets set up by the ir_texture visitor are in the
-       * m1 header, so we can't go headerless.
-       */
-      header_size = 1;
-      message.reg--;
-   }
-
-   for (int i = 0; i < vector_elements; i++) {
-      bld.MOV(retype(offset(msg_coords, i), coordinate.type), coordinate);
-      coordinate = offset(coordinate, 1);
-   }
-   fs_reg msg_end = offset(msg_coords, vector_elements);
-   fs_reg msg_lod = offset(msg_coords, 4);
-
-   if (shadow_c.file != BAD_FILE) {
-      fs_reg msg_shadow = msg_lod;
-      bld.MOV(msg_shadow, shadow_c);
-      msg_lod = offset(msg_shadow, 1);
-      msg_end = msg_lod;
-   }
-
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex:
-      opcode = SHADER_OPCODE_TEX;
-      break;
-   case ir_txb:
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
-
-      opcode = FS_OPCODE_TXB;
-      break;
-   case ir_txl:
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
-
-      opcode = SHADER_OPCODE_TXL;
-      break;
-   case ir_txd: {
-      /**
-       *  P   =  u,    v,    r
-       * dPdx = dudx, dvdx, drdx
-       * dPdy = dudy, dvdy, drdy
-       *
-       * Load up these values:
-       * - dudx   dudy   dvdx   dvdy   drdx   drdy
-       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
-       */
-      msg_end = msg_lod;
-      for (int i = 0; i < grad_components; i++) {
-         bld.MOV(msg_end, lod);
-         lod = offset(lod, 1);
-         msg_end = offset(msg_end, 1);
-
-         bld.MOV(msg_end, lod2);
-         lod2 = offset(lod2, 1);
-         msg_end = offset(msg_end, 1);
-      }
-
-      opcode = SHADER_OPCODE_TXD;
-      break;
-   }
-   case ir_txs:
-      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
-      bld.MOV(msg_lod, lod);
-      msg_end = offset(msg_lod, 1);
-
-      opcode = SHADER_OPCODE_TXS;
-      break;
-   case ir_query_levels:
-      msg_lod = msg_end;
-      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
-      msg_end = offset(msg_lod, 1);
-
-      opcode = SHADER_OPCODE_TXS;
-      break;
-   case ir_txf:
-      msg_lod = offset(msg_coords, 3);
-      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
-      msg_end = offset(msg_lod, 1);
-
-      opcode = SHADER_OPCODE_TXF;
-      break;
-   case ir_txf_ms:
-      msg_lod = offset(msg_coords, 3);
-      /* lod */
-      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
-      /* sample index */
-      bld.MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index);
-      msg_end = offset(msg_lod, 2);
-
-      opcode = SHADER_OPCODE_TXF_CMS;
-      break;
-   case ir_lod:
-      opcode = SHADER_OPCODE_LOD;
-      break;
-   case ir_tg4:
-      opcode = SHADER_OPCODE_TG4;
-      break;
-   default:
-      unreachable("not reached");
-   }
-
-   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
-   inst->base_mrf = message.reg;
-   inst->mlen = msg_end.reg - message.reg;
-   inst->header_size = header_size;
-   inst->regs_written = 4 * reg_width;
-
-   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
-      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
-           " disallowed by hardware\n");
-   }
-
-   return inst;
-}
-
-static bool
-is_high_sampler(const struct brw_device_info *devinfo, fs_reg sampler)
-{
-   if (devinfo->gen < 8 && !devinfo->is_haswell)
-      return false;
-
-   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
-}
-
-fs_inst *
-fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
-                              fs_reg coordinate, int coord_components,
-                              fs_reg shadow_c,
-                              fs_reg lod, fs_reg lod2, int grad_components,
-                              fs_reg sample_index, fs_reg mcs, fs_reg sampler,
-                              fs_reg offset_value)
-{
-   int reg_width = dispatch_width / 8;
-   unsigned header_size = 0;
-
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
-   for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
-      sources[i] = vgrf(glsl_type::float_type);
-   }
-   int length = 0;
-
-   if (op == ir_tg4 || offset_value.file != BAD_FILE ||
-       is_high_sampler(devinfo, sampler)) {
-      /* For general texture offsets (no txf workaround), we need a header to
-       * put them in.  Note that for SIMD16 we're making space for two actual
-       * hardware registers here, so the emit will have to fix up for this.
-       *
-       * * ir4_tg4 needs to place its channel select in the header,
-       * for interaction with ARB_texture_swizzle
-       *
-       * The sampler index is only 4-bits, so for larger sampler numbers we
-       * need to offset the Sampler State Pointer in the header.
-       */
-      header_size = 1;
-      sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-      length++;
-   }
-
-   if (shadow_c.file != BAD_FILE) {
-      bld.MOV(sources[length], shadow_c);
-      length++;
-   }
-
-   bool has_nonconstant_offset =
-      offset_value.file != BAD_FILE && offset_value.file != IMM;
-   bool coordinate_done = false;
-
-   /* The sampler can only meaningfully compute LOD for fragment shader
-    * messages. For all other stages, we change the opcode to ir_txl and
-    * hardcode the LOD to 0.
-    */
-   if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) {
-      op = ir_txl;
-      lod = fs_reg(0.0f);
-   }
-
-   /* Set up the LOD info */
-   switch (op) {
-   case ir_tex:
-   case ir_lod:
-      break;
-   case ir_txb:
-      bld.MOV(sources[length], lod);
-      length++;
-      break;
-   case ir_txl:
-      bld.MOV(sources[length], lod);
-      length++;
-      break;
-   case ir_txd: {
-      no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
-
-      /* Load dPdx and the coordinate together:
-       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
-       */
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(sources[length], coordinate);
-	 coordinate = offset(coordinate, 1);
-	 length++;
-
-         /* For cube map array, the coordinate is (u,v,r,ai) but there are
-          * only derivatives for (u, v, r).
-          */
-         if (i < grad_components) {
-            bld.MOV(sources[length], lod);
-            lod = offset(lod, 1);
-            length++;
-
-            bld.MOV(sources[length], lod2);
-            lod2 = offset(lod2, 1);
-            length++;
-         }
-      }
-
-      coordinate_done = true;
-      break;
-   }
-   case ir_txs:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
-      length++;
-      break;
-   case ir_query_levels:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u));
-      length++;
-      break;
-   case ir_txf:
-      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
-       * On Gen9 they are u, v, lod, r
-       */
-
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-      coordinate = offset(coordinate, 1);
-      length++;
-
-      if (devinfo->gen >= 9) {
-         if (coord_components >= 2) {
-            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-            coordinate = offset(coordinate, 1);
-         }
-         length++;
-      }
-
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
-      length++;
-
-      for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
-         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-	 coordinate = offset(coordinate, 1);
-	 length++;
-      }
-
-      coordinate_done = true;
-      break;
-   case ir_txf_ms:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
-      length++;
-
-      /* data from the multisample control surface */
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
-      length++;
-
-      /* there is no offsetting for this message; just copy in the integer
-       * texture coordinates
-       */
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
-         coordinate = offset(coordinate, 1);
-         length++;
-      }
-
-      coordinate_done = true;
-      break;
-   case ir_tg4:
-      if (has_nonconstant_offset) {
-         if (shadow_c.file != BAD_FILE)
-            no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
-
-         /* More crazy intermixing */
-         for (int i = 0; i < 2; i++) { /* u, v */
-            bld.MOV(sources[length], coordinate);
-            coordinate = offset(coordinate, 1);
-            length++;
-         }
-
-         for (int i = 0; i < 2; i++) { /* offu, offv */
-            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
-            offset_value = offset(offset_value, 1);
-            length++;
-         }
-
-         if (coord_components == 3) { /* r if present */
-            bld.MOV(sources[length], coordinate);
-            coordinate = offset(coordinate, 1);
-            length++;
-         }
-
-         coordinate_done = true;
-      }
-      break;
-   }
-
-   /* Set up the coordinate (except for cases where it was done above) */
-   if (!coordinate_done) {
-      for (int i = 0; i < coord_components; i++) {
-         bld.MOV(sources[length], coordinate);
-         coordinate = offset(coordinate, 1);
-         length++;
-      }
-   }
-
-   int mlen;
-   if (reg_width == 2)
-      mlen = length * reg_width - header_size;
-   else
-      mlen = length * reg_width;
-
-   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_F, dispatch_width);
-   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
-
-   /* Generate the SEND */
-   enum opcode opcode;
-   switch (op) {
-   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
-   case ir_txb: opcode = FS_OPCODE_TXB; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
-   case ir_lod: opcode = SHADER_OPCODE_LOD; break;
-   case ir_tg4:
-      if (has_nonconstant_offset)
-         opcode = SHADER_OPCODE_TG4_OFFSET;
-      else
-         opcode = SHADER_OPCODE_TG4;
-      break;
-   default:
-      unreachable("not reached");
-   }
-   fs_inst *inst = bld.emit(opcode, dst, src_payload, sampler);
-   inst->base_mrf = -1;
-   inst->mlen = mlen;
-   inst->header_size = header_size;
-   inst->regs_written = 4 * reg_width;
-
-   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
-      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
-           " disallowed by hardware\n");
-   }
-
-   return inst;
-}
-
 fs_reg
 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
                              bool is_rect, uint32_t sampler, int texunit)
@@ -746,8 +140,8 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       coordinate = dst;
 
       bld.MUL(dst, src, scale_x);
-      dst = offset(dst, 1);
-      src = offset(src, 1);
+      dst = offset(dst, bld, 1);
+      src = offset(src, bld, 1);
       bld.MUL(dst, src, scale_y);
    } else if (is_rect) {
       /* On gen6+, the sampler handles the rectangle coordinates
@@ -760,7 +154,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       for (int i = 0; i < 2; i++) {
 	 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 	    fs_reg chan = coordinate;
-	    chan = offset(chan, i);
+	    chan = offset(chan, bld, i);
 
             set_condmod(BRW_CONDITIONAL_GE,
                         bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)));
@@ -785,7 +179,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       for (int i = 0; i < MIN2(coord_components, 3); i++) {
 	 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 	    fs_reg chan = coordinate;
-	    chan = offset(chan, i);
+	    chan = offset(chan, bld, i);
             set_saturate(true, bld.MOV(chan, chan));
 	 }
       }
@@ -795,31 +189,21 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 
 /* Sample from the MCS surface attached to this multisample texture. */
 fs_reg
-fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
+fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+                           const fs_reg &sampler)
 {
-   int reg_width = dispatch_width / 8;
-   fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
-                           BRW_REGISTER_TYPE_F, dispatch_width);
-   fs_reg dest = vgrf(glsl_type::uvec4_type);
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
+   const fs_reg dest = vgrf(glsl_type::uvec4_type);
+   const fs_reg srcs[] = {
+      coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(),
+      sampler, fs_reg(), fs_reg(components), fs_reg(0)
+   };
+   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
+                            ARRAY_SIZE(srcs));
 
-   /* parameters are: u, v, r; missing parameters are treated as zero */
-   for (int i = 0; i < components; i++) {
-      sources[i] = vgrf(glsl_type::float_type);
-      bld.MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate);
-      coordinate = offset(coordinate, 1);
-   }
-
-   bld.LOAD_PAYLOAD(payload, sources, components, 0);
-
-   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
-   inst->base_mrf = -1;
-   inst->mlen = components * reg_width;
-   inst->header_size = 0;
-   inst->regs_written = 4 * reg_width; /* we only care about one reg of
-                                        * response, but the sampler always
-                                        * writes 4/8
-                                        */
+   /* We only care about one reg of response, but the sampler always writes
+    * 4/8.
+    */
+   inst->regs_written = 4 * dispatch_width / 8;
 
    return dest;
 }
@@ -853,12 +237,20 @@ fs_visitor::emit_texture(ir_texture_opcode op,
 
          for (int i=0; i<4; i++) {
             bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f));
-            res = offset(res, 1);
+            res = offset(res, bld, 1);
          }
          return;
       }
    }
 
+   if (op == ir_query_levels) {
+      /* textureQueryLevels() is implemented in terms of TXS so we need to
+       * pass a valid LOD argument.
+       */
+      assert(lod.file == BAD_FILE);
+      lod = fs_reg(0u);
+   }
+
    if (coordinate.file != BAD_FILE) {
       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
@@ -871,26 +263,50 @@ fs_visitor::emit_texture(ir_texture_opcode op,
     * samples, so don't worry about them.
     */
    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
+   const fs_reg srcs[] = {
+      coordinate, shadow_c, lod, lod2,
+      sample_index, mcs, sampler_reg, offset_value,
+      fs_reg(coord_components), fs_reg(grad_components)
+   };
+   enum opcode opcode;
 
-   if (devinfo->gen >= 7) {
-      inst = emit_texture_gen7(op, dst, coordinate, coord_components,
-                               shadow_c, lod, lod2, grad_components,
-                               sample_index, mcs, sampler_reg,
-                               offset_value);
-   } else if (devinfo->gen >= 5) {
-      inst = emit_texture_gen5(op, dst, coordinate, coord_components,
-                               shadow_c, lod, lod2, grad_components,
-                               sample_index, sampler,
-                               offset_value.file != BAD_FILE);
-   } else if (dispatch_width == 16) {
-      inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
-                                      shadow_c, lod, sampler);
-   } else {
-      inst = emit_texture_gen4(op, dst, coordinate, coord_components,
-                               shadow_c, lod, lod2, grad_components,
-                               sampler);
+   switch (op) {
+   case ir_tex:
+      opcode = SHADER_OPCODE_TEX_LOGICAL;
+      break;
+   case ir_txb:
+      opcode = FS_OPCODE_TXB_LOGICAL;
+      break;
+   case ir_txl:
+      opcode = SHADER_OPCODE_TXL_LOGICAL;
+      break;
+   case ir_txd:
+      opcode = SHADER_OPCODE_TXD_LOGICAL;
+      break;
+   case ir_txf:
+      opcode = SHADER_OPCODE_TXF_LOGICAL;
+      break;
+   case ir_txf_ms:
+      opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
+      break;
+   case ir_txs:
+   case ir_query_levels:
+      opcode = SHADER_OPCODE_TXS_LOGICAL;
+      break;
+   case ir_lod:
+      opcode = SHADER_OPCODE_LOD_LOGICAL;
+      break;
+   case ir_tg4:
+      opcode = (offset_value.file != BAD_FILE && offset_value.file != IMM ?
+                SHADER_OPCODE_TG4_OFFSET_LOGICAL : SHADER_OPCODE_TG4_LOGICAL);
+      break;
+   default:
+      unreachable("Invalid texture opcode.");
    }
 
+   inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+   inst->regs_written = 4 * dispatch_width / 8;
+
    if (shadow_c.file != BAD_FILE)
       inst->shadow_compare = true;
 
@@ -907,17 +323,17 @@ fs_visitor::emit_texture(ir_texture_opcode op,
 
    /* fixup #layers for cube map arrays */
    if (op == ir_txs && is_cube_array) {
-      fs_reg depth = offset(dst, 2);
+      fs_reg depth = offset(dst, bld, 2);
       fs_reg fixed_depth = vgrf(glsl_type::int_type);
       bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 
       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
-      int components = inst->regs_written / (dst.width / 8);
+      int components = inst->regs_written / (inst->exec_size / 8);
       for (int i = 0; i < components; i++) {
          if (i == 2) {
             fixed_payload[i] = fixed_depth;
          } else {
-            fixed_payload[i] = offset(dst, i);
+            fixed_payload[i] = offset(dst, bld, i);
          }
       }
       bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0);
@@ -952,7 +368,7 @@ fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
          bld.ASR(dst, dst, fs_reg(32 - width));
       }
 
-      dst = offset(dst, 1);
+      dst = offset(dst, bld, 1);
    }
 }
 
@@ -989,7 +405,7 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
 {
    if (op == ir_query_levels) {
       /* # levels is in .w */
-      this->result = offset(orig_val, 3);
+      this->result = offset(orig_val, bld, 3);
       return;
    }
 
@@ -1010,15 +426,15 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
       for (int i = 0; i < 4; i++) {
 	 int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
 	 fs_reg l = swizzled_result;
-	 l = offset(l, i);
+	 l = offset(l, bld, i);
 
 	 if (swiz == SWIZZLE_ZERO) {
             bld.MOV(l, fs_reg(0.0f));
 	 } else if (swiz == SWIZZLE_ONE) {
             bld.MOV(l, fs_reg(1.0f));
 	 } else {
-            bld.MOV(l, offset(orig_val,
-                              GET_SWZ(key_tex->swizzles[sampler], i)));
+            bld.MOV(l, offset(orig_val, bld,
+                                  GET_SWZ(key_tex->swizzles[sampler], i)));
 	 }
       }
       this->result = swizzled_result;
@@ -1114,118 +530,6 @@ fs_visitor::try_replace_with_sel()
    return false;
 }
 
-void
-fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
-                                fs_reg dst, fs_reg offset, fs_reg src0,
-                                fs_reg src1)
-{
-   int reg_width = dispatch_width / 8;
-   int length = 0;
-
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
-
-   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   /* Initialize the sample mask in the message header. */
-   bld.exec_all().MOV(sources[0], fs_reg(0u));
-
-   if (stage == MESA_SHADER_FRAGMENT) {
-      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         bld.exec_all()
-            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
-      } else {
-         bld.exec_all()
-            .MOV(component(sources[0], 7),
-                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
-      }
-   } else {
-      /* The execution mask is part of the side-band information sent together with
-       * the message payload to the data port. It's implicitly ANDed with the sample
-       * mask sent in the header to compute the actual set of channels that execute
-       * the atomic operation.
-       */
-      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      bld.exec_all()
-         .MOV(component(sources[0], 7), fs_reg(0xffffu));
-   }
-   length++;
-
-   /* Set the atomic operation offset. */
-   sources[1] = vgrf(glsl_type::uint_type);
-   bld.MOV(sources[1], offset);
-   length++;
-
-   /* Set the atomic operation arguments. */
-   if (src0.file != BAD_FILE) {
-      sources[length] = vgrf(glsl_type::uint_type);
-      bld.MOV(sources[length], src0);
-      length++;
-   }
-
-   if (src1.file != BAD_FILE) {
-      sources[length] = vgrf(glsl_type::uint_type);
-      bld.MOV(sources[length], src1);
-      length++;
-   }
-
-   int mlen = 1 + (length - 1) * reg_width;
-   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_UD, dispatch_width);
-   bld.LOAD_PAYLOAD(src_payload, sources, length, 1);
-
-   /* Emit the instruction. */
-   fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
-                            fs_reg(surf_index), fs_reg(atomic_op));
-   inst->mlen = mlen;
-}
-
-void
-fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
-                                      fs_reg offset)
-{
-   int reg_width = dispatch_width / 8;
-
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
-
-   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   /* Initialize the sample mask in the message header. */
-   bld.exec_all()
-      .MOV(sources[0], fs_reg(0u));
-
-   if (stage == MESA_SHADER_FRAGMENT) {
-      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         bld.exec_all()
-            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
-      } else {
-         bld.exec_all()
-            .MOV(component(sources[0], 7),
-                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
-      }
-   } else {
-      /* The execution mask is part of the side-band information sent together with
-       * the message payload to the data port. It's implicitly ANDed with the sample
-       * mask sent in the header to compute the actual set of channels that execute
-       * the atomic operation.
-       */
-      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      bld.exec_all()
-         .MOV(component(sources[0], 7), fs_reg(0xffffu));
-   }
-
-   /* Set the surface read offset. */
-   sources[1] = vgrf(glsl_type::uint_type);
-   bld.MOV(sources[1], offset);
-
-   int mlen = 1 + reg_width;
-   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
-                               BRW_REGISTER_TYPE_UD, dispatch_width);
-   fs_inst *inst = bld.LOAD_PAYLOAD(src_payload, sources, 2, 1);
-
-   /* Emit the instruction. */
-   inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
-                   fs_reg(surf_index), fs_reg(1));
-   inst->mlen = mlen;
-}
-
 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
 void
 fs_visitor::emit_dummy_fs()
@@ -1235,8 +539,8 @@ fs_visitor::emit_dummy_fs()
    /* Everyone's favorite color. */
    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
    for (int i = 0; i < 4; i++) {
-      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
-                     dispatch_width), fs_reg(color[i]));
+      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F),
+              fs_reg(color[i]));
    }
 
    fs_inst *write;
@@ -1315,14 +619,14 @@ fs_visitor::emit_interpolation_setup_gen4()
 
    if (devinfo->has_pln && dispatch_width == 16) {
       for (unsigned i = 0; i < 2; i++) {
-         abld.half(i).ADD(half(offset(delta_xy, i), 0),
+         abld.half(i).ADD(half(offset(delta_xy, abld, i), 0),
                           half(this->pixel_x, i), xstart);
-         abld.half(i).ADD(half(offset(delta_xy, i), 1),
+         abld.half(i).ADD(half(offset(delta_xy, abld, i), 1),
                           half(this->pixel_y, i), ystart);
       }
    } else {
-      abld.ADD(offset(delta_xy, 0), this->pixel_x, xstart);
-      abld.ADD(offset(delta_xy, 1), this->pixel_y, ystart);
+      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
+      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
    }
 
    abld = bld.annotate("compute pos.w and 1/pos.w");
@@ -1356,9 +660,10 @@ fs_visitor::emit_interpolation_setup_gen6()
        * compute our pixel centers.
        */
       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
-                          BRW_REGISTER_TYPE_UW, dispatch_width * 2);
-      abld.exec_all()
-          .ADD(int_pixel_xy,
+                          BRW_REGISTER_TYPE_UW);
+
+      const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0);
+      dbld.ADD(int_pixel_xy,
                fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
                fs_reg(brw_imm_v(0x11001010)));
 
@@ -1407,33 +712,6 @@ fs_visitor::emit_interpolation_setup_gen6()
    }
 }
 
-void
-fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
-                                unsigned exec_size, bool use_2nd_half)
-{
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   fs_inst *inst;
-
-   if (key->clamp_fragment_color) {
-      fs_reg tmp = vgrf(glsl_type::vec4_type);
-      assert(color.type == BRW_REGISTER_TYPE_F);
-      for (unsigned i = 0; i < components; i++) {
-         inst = bld.MOV(offset(tmp, i), offset(color, i));
-         inst->saturate = true;
-      }
-      color = tmp;
-   }
-
-   if (exec_size < dispatch_width) {
-      unsigned half_idx = use_2nd_half ? 1 : 0;
-      for (unsigned i = 0; i < components; i++)
-         dst[i] = half(offset(color, i), half_idx);
-   } else {
-      for (unsigned i = 0; i < components; i++)
-         dst[i] = offset(color, i);
-   }
-}
-
 static enum brw_conditional_mod
 cond_for_alpha_func(GLenum func)
 {
@@ -1478,7 +756,7 @@ fs_visitor::emit_alpha_test()
                      BRW_CONDITIONAL_NEQ);
    } else {
       /* RT0 alpha */
-      fs_reg color = offset(outputs[0], 3);
+      fs_reg color = offset(outputs[0], bld, 3);
 
       /* f0.1 &= func(color, ref) */
       cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref),
@@ -1491,152 +769,36 @@ fs_visitor::emit_alpha_test()
 fs_inst *
 fs_visitor::emit_single_fb_write(const fs_builder &bld,
                                  fs_reg color0, fs_reg color1,
-                                 fs_reg src0_alpha, unsigned components,
-                                 unsigned exec_size, bool use_2nd_half)
+                                 fs_reg src0_alpha, unsigned components)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   int header_size = 2, payload_header_size;
 
-   /* We can potentially have a message length of up to 15, so we have to set
-    * base_mrf to either 0 or 1 in order to fit in m0..m15.
-    */
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
-   int length = 0;
-
-   /* From the Sandy Bridge PRM, volume 4, page 198:
-    *
-    *     "Dispatched Pixel Enables. One bit per pixel indicating
-    *      which pixels were originally enabled when the thread was
-    *      dispatched. This field is only required for the end-of-
-    *      thread message and on all dual-source messages."
-    */
-   if (devinfo->gen >= 6 &&
-       (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
-       color1.file == BAD_FILE &&
-       key->nr_color_regions == 1) {
-      header_size = 0;
-   }
-
-   if (header_size != 0) {
-      assert(header_size == 2);
-      /* Allocate 2 registers for a header */
-      length += 2;
-   }
-
-   if (payload.aa_dest_stencil_reg) {
-      sources[length] = fs_reg(GRF, alloc.allocate(1));
-      bld.exec_all().annotate("FB write stencil/AA alpha")
-         .MOV(sources[length],
-              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
-      length++;
-   }
-
-   prog_data->uses_omask =
-      prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
-   if (prog_data->uses_omask) {
-      assert(this->sample_mask.file != BAD_FILE);
-      /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
-       * it's unsinged single words, one vgrf is always 16-wide.
-       */
-      sources[length] = fs_reg(GRF, alloc.allocate(1),
-                               BRW_REGISTER_TYPE_UW, 16);
-      bld.exec_all().annotate("FB write oMask")
-         .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
-      length++;
-   }
-
-   payload_header_size = length;
-
-   if (color0.file == BAD_FILE) {
-      /* Even if there's no color buffers enabled, we still need to send
-       * alpha out the pipeline to our null renderbuffer to support
-       * alpha-testing, alpha-to-coverage, and so on.
-       */
-      if (this->outputs[0].file != BAD_FILE)
-         setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3),
-                             1, exec_size, false);
-      length += 4;
-   } else if (color1.file == BAD_FILE) {
-      if (src0_alpha.file != BAD_FILE) {
-         setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
-         length++;
-      }
-
-      setup_color_payload(&sources[length], color0, components,
-                          exec_size, use_2nd_half);
-      length += 4;
-   } else {
-      setup_color_payload(&sources[length], color0, components,
-                          exec_size, use_2nd_half);
-      length += 4;
-      setup_color_payload(&sources[length], color1, components,
-                          exec_size, use_2nd_half);
-      length += 4;
-   }
+   /* Hand over gl_FragDepth or the payload depth. */
+   const fs_reg dst_depth = (payload.dest_depth_reg ?
+                             fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) :
+                             fs_reg());
+   fs_reg src_depth;
 
    if (source_depth_to_render_target) {
-      if (devinfo->gen == 6) {
-	 /* For outputting oDepth on gen6, SIMD8 writes have to be
-	  * used.  This would require SIMD8 moves of each half to
-	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
-	  * Just bail on doing so for now.
-	  */
-	 no16("Missing support for simd16 depth writes on gen6\n");
-      }
-
-      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
-	 /* Hand over gl_FragDepth. */
-	 assert(this->frag_depth.file != BAD_FILE);
-         if (exec_size < dispatch_width) {
-            sources[length] = half(this->frag_depth, use_2nd_half);
-         } else {
-            sources[length] = this->frag_depth;
-         }
-      } else {
-	 /* Pass through the payload depth. */
-         sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
-      }
-      length++;
+      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+         src_depth = frag_depth;
+      else
+         src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
    }
 
-   if (payload.dest_depth_reg)
-      sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
+   const fs_reg sources[] = {
+      color0, color1, src0_alpha, src_depth, dst_depth, sample_mask,
+      fs_reg(components)
+   };
+   fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
+                             sources, ARRAY_SIZE(sources));
 
-   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
-   fs_inst *load;
-   fs_inst *write;
-   if (devinfo->gen >= 7) {
-      /* Send from the GRF */
-      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
-      load = ubld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
-      payload.reg = alloc.allocate(load->regs_written);
-      load->dst = payload;
-      write = ubld.emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
-      write->base_mrf = -1;
-   } else {
-      /* Send from the MRF */
-      load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
-                               sources, length, payload_header_size);
-
-      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
-       * will do this for us if we just give it a COMPR4 destination.
-       */
-      if (devinfo->gen < 6 && exec_size == 16)
-         load->dst.reg |= BRW_MRF_COMPR4;
-
-      write = ubld.emit(FS_OPCODE_FB_WRITE);
-      write->exec_size = exec_size;
-      write->base_mrf = 1;
-   }
-
-   write->mlen = load->regs_written;
-   write->header_size = header_size;
    if (prog_data->uses_kill) {
       write->predicate = BRW_PREDICATE_NORMAL;
       write->flag_subreg = 1;
    }
+
    return write;
 }
 
@@ -1648,37 +810,24 @@ fs_visitor::emit_fb_writes()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
    fs_inst *inst = NULL;
+
+   if (source_depth_to_render_target && devinfo->gen == 6) {
+      /* For outputting oDepth on gen6, SIMD8 writes have to be used.  This
+       * would require SIMD8 moves of each half to message regs, e.g. by using
+       * the SIMD lowering pass.  Unfortunately this is more difficult than it
+       * sounds because the SIMD8 single-source message lacks channel selects
+       * for the second and third subspans.
+       */
+      no16("Missing support for simd16 depth writes on gen6\n");
+   }
+
    if (do_dual_src) {
       const fs_builder abld = bld.annotate("FB dual-source write");
 
       inst = emit_single_fb_write(abld, this->outputs[0],
-                                  this->dual_src_output, reg_undef, 4, 8);
+                                  this->dual_src_output, reg_undef, 4);
       inst->target = 0;
 
-      /* SIMD16 dual source blending requires to send two SIMD8 dual source
-       * messages, where each message contains color data for 8 pixels. Color
-       * data for the first group of pixels is stored in the "lower" half of
-       * the color registers, so in SIMD16, the previous message did:
-       * m + 0: r0
-       * m + 1: g0
-       * m + 2: b0
-       * m + 3: a0
-       *
-       * Here goes the second message, which packs color data for the
-       * remaining 8 pixels. Color data for these pixels is stored in the
-       * "upper" half of the color registers, so we need to do:
-       * m + 0: r1
-       * m + 1: g1
-       * m + 2: b1
-       * m + 3: a1
-       */
-      if (dispatch_width == 16) {
-         inst = emit_single_fb_write(abld, this->outputs[0],
-                                     this->dual_src_output, reg_undef, 4, 8,
-                                     true);
-         inst->target = 0;
-      }
-
       prog_data->dual_src_blend = true;
    } else {
       for (int target = 0; target < key->nr_color_regions; target++) {
@@ -1691,12 +840,11 @@ fs_visitor::emit_fb_writes()
 
          fs_reg src0_alpha;
          if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
-            src0_alpha = offset(outputs[0], 3);
+            src0_alpha = offset(outputs[0], bld, 3);
 
          inst = emit_single_fb_write(abld, this->outputs[target], reg_undef,
                                      src0_alpha,
-                                     this->output_components[target],
-                                     dispatch_width);
+                                     this->output_components[target]);
          inst->target = target;
       }
    }
@@ -1706,8 +854,15 @@ fs_visitor::emit_fb_writes()
        * alpha out the pipeline to our null renderbuffer to support
        * alpha-testing, alpha-to-coverage, and so on.
        */
-      inst = emit_single_fb_write(bld, reg_undef, reg_undef, reg_undef, 0,
-                                  dispatch_width);
+      /* FINISHME: Factor out this frequently recurring pattern into a
+       * helper function.
+       */
+      const fs_reg srcs[] = { reg_undef, reg_undef,
+                              reg_undef, offset(this->outputs[0], bld, 3) };
+      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
+
+      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
       inst->target = 0;
    }
 
@@ -1730,6 +885,12 @@ fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
    }
 }
 
+/**
+ * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances.
+ *
+ * This does nothing if the shader uses gl_ClipDistance or user clipping is
+ * disabled altogether.
+ */
 void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
 {
    struct brw_vue_prog_data *vue_prog_data =
@@ -1737,6 +898,10 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
    const struct brw_vue_prog_key *key =
       (const struct brw_vue_prog_key *) this->key;
 
+   /* Bail unless some sort of legacy clipping is enabled */
+   if (!key->userclip_active || prog->UsesClipDistanceOut)
+      return;
+
    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
     *
     *     "If a linked set of shaders forming the vertex stage contains no
@@ -1774,13 +939,13 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
       abld.MUL(output, outputs[clip_vertex], u);
       for (int j = 1; j < 4; j++) {
          u.reg = userplane[i].reg + j;
-         abld.MAD(output, output, offset(outputs[clip_vertex], j), u);
+         abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u);
       }
    }
 }
 
 void
-fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
+fs_visitor::emit_urb_writes()
 {
    int slot, urb_offset, length;
    struct brw_vs_prog_data *vs_prog_data =
@@ -1793,21 +958,24 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
    bool flush;
    fs_reg sources[8];
 
-   /* Lower legacy ff and ClipVertex clipping to clip distances */
-   if (key->base.userclip_active && !prog->UsesClipDistanceOut)
-      compute_clip_distance(clip_planes);
-
    /* If we don't have any valid slots to write, just do a minimal urb write
-    * send to terminate the shader. */
+    * send to terminate the shader.  This includes 1 slot of undefined data,
+    * because it's invalid to write 0 data:
+    *
+    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
+    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
+    * Write Data Payload:
+    *
+    *    "The write data payload can be between 1 and 8 message phases long."
+    */
    if (vue_map->slots_valid == 0) {
-
-      fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+      fs_reg payload = fs_reg(GRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
       bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
                                                 BRW_REGISTER_TYPE_UD)));
 
       fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
       inst->eot = true;
-      inst->mlen = 1;
+      inst->mlen = 2;
       inst->offset = 1;
       return;
    }
@@ -1888,13 +1056,13 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
              */
             for (int i = 0; i < 4; i++) {
                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
-               src = offset(this->outputs[varying], i);
+               src = offset(this->outputs[varying], bld, i);
                set_saturate(true, bld.MOV(reg, src));
                sources[length++] = reg;
             }
          } else {
             for (int i = 0; i < 4; i++)
-               sources[length++] = offset(this->outputs[varying], i);
+               sources[length++] = offset(this->outputs[varying], bld, i);
          }
          break;
       }
@@ -1911,7 +1079,7 @@ fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
       if (flush) {
          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
-                                 BRW_REGISTER_TYPE_F, dispatch_width);
+                                 BRW_REGISTER_TYPE_F);
          payload_sources[0] =
             fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
@@ -1944,7 +1112,7 @@ fs_visitor::emit_cs_terminate()
     */
    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
    fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   bld.exec_all().MOV(payload, g0);
+   bld.group(8, 0).exec_all().MOV(payload, g0);
 
    /* Send a message to the thread spawner to terminate the thread. */
    fs_inst *inst = bld.exec_all()
@@ -2012,7 +1180,7 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
    this->no16_msg = NULL;
 
    this->nir_locals = NULL;
-   this->nir_globals = NULL;
+   this->nir_ssa_values = NULL;
 
    memset(&this->payload, 0, sizeof(this->payload));
    memset(this->outputs, 0, sizeof(this->outputs));
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 45c132b4a9e..4ad65215756 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -68,12 +68,16 @@ brw_compile_gs_prog(struct brw_context *brw,
 
    /* We also upload clip plane data as uniforms */
    param_count += MAX_CLIP_PLANES * 4;
+   param_count += gs->NumImages * BRW_IMAGE_PARAM_SIZE;
 
    c.prog_data.base.base.param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    c.prog_data.base.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   c.prog_data.base.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param, gs->NumImages);
    c.prog_data.base.base.nr_params = param_count;
+   c.prog_data.base.base.nr_image_params = gs->NumImages;
 
    if (brw->gen >= 7) {
       if (gp->program.OutputType == GL_POINTS) {
@@ -270,16 +274,6 @@ brw_compile_gs_prog(struct brw_context *brw,
       return false;
    }
 
-   /* Scratch space is used for register spilling */
-   if (c.base.last_scratch) {
-      perf_debug("Geometry shader triggered register spilling.  "
-                 "Try reducing the number of live vec4 values to "
-                 "improve performance.\n");
-
-      c.prog_data.base.base.total_scratch
-         = brw_get_scratch_size(c.base.last_scratch*REG_SIZE);
-   }
-
    output->mem_ctx = mem_ctx;
    output->program = program;
    output->program_size = program_size;
diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
index 0b8bfc3d9bd..0bb307432d0 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
@@ -119,3 +119,28 @@ const struct brw_tracked_state brw_gs_abo_surfaces = {
    },
    .emit = brw_upload_gs_abo_surfaces,
 };
+
+static void
+brw_upload_gs_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_GEOMETRY_PROGRAM: the current geometry-stage shader program */
+   struct gl_shader_program *prog =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
+
+   if (prog) { /* without a geometry program there is nothing to upload */
+      /* BRW_NEW_GS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY],
+                                &brw->gs.base, &brw->gs.prog_data->base.base);
+   }
+}
+
+const struct brw_tracked_state brw_gs_image_surfaces = {
+   .dirty = { /* NOTE(review): presumably the flags that re-trigger .emit */
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_GS_PROG_DATA |
+             BRW_NEW_IMAGE_UNITS,
+   },
+   .emit = brw_upload_gs_image_surfaces,
+};
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index 7a8c210118c..46eff1dd381 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -683,9 +683,9 @@ brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low)
    high %= 64;
    low %= 64;
 
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = (1ull << (high - low + 1)) - 1;
 
-   return (inst->data[word] & mask) >> low;
+   return (inst->data[word] >> low) & mask;
 }
 
 /**
@@ -702,12 +702,12 @@ brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value)
    high %= 64;
    low %= 64;
 
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low;
 
    /* Make sure the supplied value actually fits in the given bitfield. */
    assert((value & (mask >> low)) == value);
 
-   inst->data[word] = (inst->data[word] & ~mask) | ((value << low) & mask);
+   inst->data[word] = (inst->data[word] & ~mask) | (value << low);
 }
 
 #undef BRW_IA16_ADDR_IMM
@@ -731,9 +731,9 @@ typedef struct {
 static inline unsigned
 brw_compact_inst_bits(brw_compact_inst *inst, unsigned high, unsigned low)
 {
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = (1ull << (high - low + 1)) - 1;
 
-   return (inst->data & mask) >> low;
+   return (inst->data >> low) & mask;
 }
 
 /**
@@ -745,12 +745,12 @@ static inline void
 brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low,
                           uint64_t value)
 {
-   const uint64_t mask = (((1ull << (high - low + 1)) - 1) << low);
+   const uint64_t mask = ((1ull << (high - low + 1)) - 1) << low;
 
    /* Make sure the supplied value actually fits in the given bitfield. */
    assert((value & (mask >> low)) == value);
 
-   inst->data = (inst->data & ~mask) | ((value << low) & mask);
+   inst->data = (inst->data & ~mask) | (value << low);
 }
 
 #define F(name, high, low)                                      \
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 96dc20da3cf..97c6f8b2500 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -44,11 +44,16 @@ public:
    fs_reg(struct brw_reg fixed_hw_reg);
    fs_reg(enum register_file file, int reg);
    fs_reg(enum register_file file, int reg, enum brw_reg_type type);
-   fs_reg(enum register_file file, int reg, enum brw_reg_type type, uint8_t width);
 
    bool equals(const fs_reg &r) const;
    bool is_contiguous() const;
 
+   /**
+    * Return the size in bytes of a single logical component of the
+    * register assuming the given execution width.
+    */
+   unsigned component_size(unsigned width) const;
+
    /** Smear a channel of the reg to all channels. */
    fs_reg &set_smear(unsigned subreg);
 
@@ -60,14 +65,6 @@ public:
 
    fs_reg *reladdr;
 
-   /**
-    * The register width.  This indicates how many hardware values are
-    * represented by each virtual value.  Valid values are 1, 8, or 16.
-    * For immediate values, this is 1.  Most of the rest of the time, it
-    * will be equal to the dispatch width.
-    */
-   uint8_t width;
-
    /** Register region horizontal stride */
    uint8_t stride;
 };
@@ -128,34 +125,11 @@ horiz_offset(fs_reg reg, unsigned delta)
    return reg;
 }
 
-static inline fs_reg
-offset(fs_reg reg, unsigned delta)
-{
-   switch (reg.file) {
-   case BAD_FILE:
-      break;
-   case GRF:
-   case MRF:
-   case ATTR:
-      return byte_offset(reg,
-                         delta * MAX2(reg.width * reg.stride, 1) *
-                         type_sz(reg.type));
-   case UNIFORM:
-      reg.reg_offset += delta;
-      break;
-   default:
-      assert(delta == 0);
-   }
-   return reg;
-}
-
 static inline fs_reg
 component(fs_reg reg, unsigned idx)
 {
    assert(reg.subreg_offset == 0);
-   assert(idx < reg.width);
    reg.subreg_offset = idx * type_sz(reg.type);
-   reg.width = 1;
    reg.stride = 0;
    return reg;
 }
@@ -163,7 +137,7 @@ component(fs_reg reg, unsigned idx)
 static inline bool
 is_uniform(const fs_reg &reg)
 {
-   return (reg.width == 1 || reg.stride == 0 || reg.is_null()) &&
+   return (reg.stride == 0 || reg.is_null()) &&
           (!reg.reladdr || is_uniform(*reg.reladdr));
 }
 
@@ -185,8 +159,6 @@ half(fs_reg reg, unsigned idx)
 
    case GRF:
    case MRF:
-      assert(reg.width == 16);
-      reg.width = 8;
       return horiz_offset(reg, 8 * idx);
 
    case ATTR:
@@ -210,20 +182,13 @@ public:
 
    fs_inst();
    fs_inst(enum opcode opcode, uint8_t exec_size);
-   fs_inst(enum opcode opcode, const fs_reg &dst);
+   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg &src0);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg &src0, const fs_reg &src1);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-           const fs_reg &src1);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-           const fs_reg &src1, const fs_reg &src2);
-   fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg src[],
-           unsigned sources);
    fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
            const fs_reg src[], unsigned sources);
    fs_inst(const fs_inst &that);
@@ -236,6 +201,7 @@ public:
    bool is_send_from_grf() const;
    bool is_partial_write() const;
    bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
+   unsigned components_read(unsigned i) const;
    int regs_read(int arg) const;
    bool can_do_source_mods(const struct brw_device_info *devinfo);
    bool has_side_effects() const;
diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index fceacae0e51..966a410a15d 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -113,6 +113,8 @@ public:
    dst_reg(register_file file, int reg);
    dst_reg(register_file file, int reg, const glsl_type *type,
            unsigned writemask);
+   dst_reg(register_file file, int reg, brw_reg_type type,
+           unsigned writemask);
    dst_reg(struct brw_reg reg);
    dst_reg(class vec4_visitor *v, const struct glsl_type *type);
 
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index 49f2e3e498c..f5ecbb54989 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -128,7 +128,7 @@ brw_bind_rep_write_shader(struct brw_context *brw, float *color)
    _mesa_AttachShader(clear->shader_prog, vs);
    _mesa_DeleteShader(vs);
    _mesa_BindAttribLocation(clear->shader_prog, 0, "position");
-   _mesa_ObjectLabel(GL_PROGRAM, clear->shader_prog, -1, "meta clear");
+   _mesa_ObjectLabel(GL_PROGRAM, clear->shader_prog, -1, "meta repclear");
    _mesa_LinkProgram(clear->shader_prog);
 
    clear->color_location =
@@ -200,7 +200,7 @@ brw_draw_rectlist(struct gl_context *ctx, struct rect *rect, int num_instances)
 
    brw_draw_prims(ctx, &prim, 1, NULL,
                   GL_TRUE, start, start + count - 1,
-                  NULL, NULL);
+                  NULL, 0, NULL);
 }
 
 static void
@@ -348,7 +348,7 @@ is_color_fast_clear_compatible(struct brw_context *brw,
    }
 
    for (int i = 0; i < 4; i++) {
-      if (color->f[i] != 0.0 && color->f[i] != 1.0 &&
+      if (color->f[i] != 0.0f && color->f[i] != 1.0f &&
           _mesa_format_has_color_component(format, i)) {
          return false;
       }
@@ -366,7 +366,7 @@ compute_fast_clear_color_bits(const union gl_color_union *color)
    uint32_t bits = 0;
    for (int i = 0; i < 4; i++) {
       /* Testing for non-0 works for integer and float colors */
-      if (color->f[i] != 0.0)
+      if (color->f[i] != 0.0f)
          bits |= 1 << (GEN7_SURFACE_CLEAR_COLOR_SHIFT + (3 - i));
    }
    return bits;
@@ -623,7 +623,7 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
     *     write-flush must be issued before sending any DRAW commands on that
     *     render target.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* If we had to fall back to plain clear for any buffers, clear those now
     * by calling into meta.
@@ -677,7 +677,7 @@ brw_meta_resolve_color(struct brw_context *brw,
    GLuint fbo, rbo;
    struct rect rect;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    _mesa_meta_begin(ctx, MESA_META_ALL);
 
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index d079197a2a9..aa6df16eb04 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -239,10 +239,10 @@ setup_coord_coeff(GLuint prog, GLuint multiplier, GLuint offset,
 
    if (mirror) {
       _mesa_Uniform1f(multiplier, -scale);
-      _mesa_Uniform1f(offset, src_0 + (dst_1 - 0.5) * scale);
+      _mesa_Uniform1f(offset, src_0 + (dst_1 - 0.5f) * scale);
    } else {
       _mesa_Uniform1f(multiplier, scale);
-      _mesa_Uniform1f(offset, src_0 + (-dst_0 + 0.5) * scale);
+      _mesa_Uniform1f(offset, src_0 + (-dst_0 + 0.5f) * scale);
    }
 }
 
@@ -500,11 +500,11 @@ brw_meta_fbo_stencil_blit(struct brw_context *brw,
                              .mirror_x = mirror_x, .mirror_y = mirror_y };
    adjust_mip_level(dst_mt, dst_irb->mt_level, dst_irb->mt_layer, &dims);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
    _mesa_meta_begin(ctx, MESA_META_ALL);
    brw_meta_stencil_blit(brw,
                          dst_mt, dst_irb->mt_level, dst_irb->mt_layer, &dims);
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 void
@@ -524,7 +524,7 @@ brw_meta_stencil_updownsample(struct brw_context *brw,
    if (dst->stencil_mt)
       dst = dst->stencil_mt;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
    _mesa_meta_begin(ctx, MESA_META_ALL);
 
    _mesa_GenFramebuffers(1, &fbo);
@@ -535,7 +535,7 @@ brw_meta_stencil_updownsample(struct brw_context *brw,
                                  GL_RENDERBUFFER, rbo);
 
    brw_meta_stencil_blit(brw, dst, 0, 0, &dims);
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    _mesa_DeleteRenderbuffers(1, &rbo);
    _mesa_DeleteFramebuffers(1, &fbo);
diff --git a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
index 21507b1ad2a..f39d50a69e6 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
@@ -116,7 +116,7 @@ brw_meta_updownsample(struct brw_context *brw,
       blit_bit = GL_COLOR_BUFFER_BIT;
    }
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    _mesa_meta_begin(ctx, MESA_META_ALL);
    _mesa_GenFramebuffers(2, fbos);
@@ -147,5 +147,5 @@ brw_meta_updownsample(struct brw_context *brw,
 
    _mesa_meta_end(ctx);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 5a4515b582d..e9d9467d330 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -44,7 +44,8 @@
 #include "main/glformats.h"
 
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
-static void upload_drawing_rect(struct brw_context *brw)
+static void
+upload_drawing_rect(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    const struct gl_framebuffer *fb = ctx->DrawBuffer;
@@ -73,7 +74,8 @@ const struct brw_tracked_state brw_drawing_rect = {
  * The state pointers in this packet are all relative to the general state
  * base address set by CMD_STATE_BASE_ADDRESS, which is 0.
  */
-static void upload_pipelined_state_pointers(struct brw_context *brw )
+static void
+upload_pipelined_state_pointers(struct brw_context *brw)
 {
    if (brw->gen == 5) {
       /* Need to flush before changing clip max threads for errata. */
@@ -104,7 +106,8 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
    brw->ctx.NewDriverState |= BRW_NEW_PSP;
 }
 
-static void upload_psp_urb_cbs(struct brw_context *brw )
+static void
+upload_psp_urb_cbs(struct brw_context *brw)
 {
    upload_pipelined_state_pointers(brw);
    brw_upload_urb_fence(brw);
@@ -580,7 +583,7 @@ brw_emit_depth_stencil_hiz(struct brw_context *brw,
     * non-pipelined state that will need the PIPE_CONTROL workaround.
     */
    if (brw->gen == 6) {
-      intel_emit_depth_stall_flushes(brw);
+      brw_emit_depth_stall_flushes(brw);
    }
 
    unsigned int len;
@@ -700,13 +703,11 @@ const struct brw_tracked_state brw_depthbuffer = {
    .emit = brw_emit_depthbuffer,
 };
 
-
-
-/***********************************************************************
+/**
  * Polygon stipple packet
  */
-
-static void upload_polygon_stipple(struct brw_context *brw)
+static void
+upload_polygon_stipple(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    GLuint i;
@@ -728,8 +729,7 @@ static void upload_polygon_stipple(struct brw_context *brw)
    if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
       for (i = 0; i < 32; i++)
 	  OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */
-   }
-   else {
+   } else {
       for (i = 0; i < 32; i++)
 	 OUT_BATCH(ctx->PolygonStipple[i]);
    }
@@ -745,12 +745,11 @@ const struct brw_tracked_state brw_polygon_stipple = {
    .emit = upload_polygon_stipple
 };
 
-
-/***********************************************************************
+/**
  * Polygon stipple offset packet
  */
-
-static void upload_polygon_stipple_offset(struct brw_context *brw)
+static void
+upload_polygon_stipple_offset(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
 
@@ -785,10 +784,11 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
    .emit = upload_polygon_stipple_offset
 };
 
-/**********************************************************************
+/**
  * AA Line parameters
  */
-static void upload_aa_line_parameters(struct brw_context *brw)
+static void
+upload_aa_line_parameters(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
 
@@ -815,11 +815,11 @@ const struct brw_tracked_state brw_aa_line_parameters = {
    .emit = upload_aa_line_parameters
 };
 
-/***********************************************************************
+/**
  * Line stipple packet
  */
-
-static void upload_line_stipple(struct brw_context *brw)
+static void
+upload_line_stipple(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    GLfloat tmp;
@@ -834,13 +834,12 @@ static void upload_line_stipple(struct brw_context *brw)
 
    if (brw->gen >= 7) {
       /* in U1.16 */
-      tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
+      tmp = 1.0f / ctx->Line.StippleFactor;
       tmpi = tmp * (1<<16);
       OUT_BATCH(tmpi << 15 | ctx->Line.StippleFactor);
-   }
-   else {
+   } else {
       /* in U1.13 */
-      tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
+      tmp = 1.0f / ctx->Line.StippleFactor;
       tmpi = tmp * (1<<13);
       OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
    }
@@ -856,7 +855,6 @@ const struct brw_tracked_state brw_line_stipple = {
    .emit = upload_line_stipple
 };
 
-
 void
 brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
 {
@@ -872,11 +870,9 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline)
    ADVANCE_BATCH();
 }
 
-
-/***********************************************************************
+/**
  * Misc invariant state packets
  */
-
 void
 brw_upload_invariant_state(struct brw_context *brw)
 {
@@ -930,7 +926,8 @@ const struct brw_tracked_state brw_invariant_state = {
  * surface state objects, but not the surfaces that the surface state
  * objects point to.
  */
-static void upload_state_base_address( struct brw_context *brw )
+static void
+upload_state_base_address(struct brw_context *brw)
 {
    /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of
     * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index e7e16b6686a..79e31d86759 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -27,19 +27,27 @@
 #include "program/prog_to_nir.h"
 
 static void
-nir_optimize(nir_shader *nir)
+nir_optimize(nir_shader *nir, bool is_scalar)
 {
    bool progress;
    do {
       progress = false;
       nir_lower_vars_to_ssa(nir);
       nir_validate_shader(nir);
-      nir_lower_alu_to_scalar(nir);
-      nir_validate_shader(nir);
+
+      if (is_scalar) {
+         nir_lower_alu_to_scalar(nir);
+         nir_validate_shader(nir);
+      }
+
       progress |= nir_copy_prop(nir);
       nir_validate_shader(nir);
-      nir_lower_phis_to_scalar(nir);
-      nir_validate_shader(nir);
+
+      if (is_scalar) {
+         nir_lower_phis_to_scalar(nir);
+         nir_validate_shader(nir);
+      }
+
       progress |= nir_copy_prop(nir);
       nir_validate_shader(nir);
       progress |= nir_opt_dce(nir);
@@ -57,33 +65,12 @@ nir_optimize(nir_shader *nir)
    } while (progress);
 }
 
-static bool
-count_nir_instrs_in_block(nir_block *block, void *state)
-{
-   int *count = (int *) state;
-   nir_foreach_instr(block, instr) {
-      *count = *count + 1;
-   }
-   return true;
-}
-
-static int
-count_nir_instrs(nir_shader *nir)
-{
-   int count = 0;
-   nir_foreach_overload(nir, overload) {
-      if (!overload->impl)
-         continue;
-      nir_foreach_block(overload->impl, count_nir_instrs_in_block, &count);
-   }
-   return count;
-}
-
 nir_shader *
 brw_create_nir(struct brw_context *brw,
                const struct gl_shader_program *shader_prog,
                const struct gl_program *prog,
-               gl_shader_stage stage)
+               gl_shader_stage stage,
+               bool is_scalar)
 {
    struct gl_context *ctx = &brw->ctx;
    const nir_shader_compiler_options *options =
@@ -100,16 +87,15 @@ brw_create_nir(struct brw_context *brw,
    }
    nir_validate_shader(nir);
 
-   brw_process_nir(nir, brw->intelScreen->devinfo, shader_prog, stage);
+   brw_process_nir(nir, brw->intelScreen->devinfo, shader_prog, stage, is_scalar);
 
    static GLuint msg_id = 0;
    _mesa_gl_debug(&brw->ctx, &msg_id,
                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
                   MESA_DEBUG_TYPE_OTHER,
                   MESA_DEBUG_SEVERITY_NOTIFICATION,
-                  "%s NIR shader: %d inst\n",
-                  _mesa_shader_stage_to_abbrev(stage),
-                  count_nir_instrs(nir));
+                  "%s NIR shader:\n",
+                  _mesa_shader_stage_to_abbrev(stage));
 
    return nir;
 }
@@ -118,7 +104,7 @@ void
 brw_process_nir(nir_shader *nir,
                 const struct brw_device_info *devinfo,
                 const struct gl_shader_program *shader_prog,
-                gl_shader_stage stage)
+                gl_shader_stage stage, bool is_scalar)
 {
    bool debug_enabled = INTEL_DEBUG & intel_debug_flag_for_shader_stage(stage);
 
@@ -134,22 +120,33 @@ brw_process_nir(nir_shader *nir,
    nir_split_var_copies(nir);
    nir_validate_shader(nir);
 
-   nir_optimize(nir);
+   nir_optimize(nir, is_scalar);
 
    /* Lower a bunch of stuff */
    nir_lower_var_copies(nir);
    nir_validate_shader(nir);
 
    /* Get rid of split copies */
-   nir_optimize(nir);
+   nir_optimize(nir, is_scalar);
 
-   nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms,
-                                                &nir->num_direct_uniforms,
-                                                &nir->num_uniforms);
-   nir_assign_var_locations_scalar(&nir->inputs, &nir->num_inputs);
-   nir_assign_var_locations_scalar(&nir->outputs, &nir->num_outputs);
+   if (is_scalar) {
+      nir_assign_var_locations_direct_first(nir, &nir->uniforms,
+                                            &nir->num_direct_uniforms,
+                                            &nir->num_uniforms,
+                                            is_scalar);
+      nir_assign_var_locations(&nir->outputs, &nir->num_outputs, is_scalar);
+   } else {
+      nir_assign_var_locations(&nir->uniforms,
+                               &nir->num_uniforms,
+                               is_scalar);
+
+      foreach_list_typed(nir_variable, var, node, &nir->outputs)
+         var->data.driver_location = var->data.location;
+   }
+   nir_assign_var_locations(&nir->inputs, &nir->num_inputs, is_scalar);
+
+   nir_lower_io(nir, is_scalar);
 
-   nir_lower_io(nir);
    nir_validate_shader(nir);
 
    nir_remove_dead_variables(nir);
@@ -168,7 +165,7 @@ brw_process_nir(nir_shader *nir,
    nir_lower_atomics(nir);
    nir_validate_shader(nir);
 
-   nir_optimize(nir);
+   nir_optimize(nir, is_scalar);
 
    if (devinfo->gen >= 6) {
       /* Try and fuse multiply-adds */
@@ -201,9 +198,14 @@ brw_process_nir(nir_shader *nir,
       nir_print_shader(nir, stderr);
    }
 
-   nir_convert_from_ssa(nir);
+   nir_convert_from_ssa(nir, is_scalar);
    nir_validate_shader(nir);
 
+   if (!is_scalar) {
+      nir_lower_vec_to_movs(nir);
+      nir_validate_shader(nir);
+   }
+
    /* This is the last pass we run before we start emitting stuff.  It
     * determines when we need to insert boolean resolves on Gen <= 5.  We
     * run it last because it stashes data in instr->pass_flags and we don't
@@ -220,3 +222,42 @@ brw_process_nir(nir_shader *nir,
       nir_print_shader(nir, stderr);
    }
 }
+
+enum brw_reg_type
+brw_type_for_nir_type(nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_unsigned:
+      return BRW_REGISTER_TYPE_UD;
+   case nir_type_bool:
+   case nir_type_int:
+      return BRW_REGISTER_TYPE_D;
+   case nir_type_float:
+      return BRW_REGISTER_TYPE_F;
+   default:
+      unreachable("unknown type");
+   }
+
+   return BRW_REGISTER_TYPE_F;
+}
+
+/* Returns the glsl_base_type corresponding to a nir_alu_type.
+ * This is used by both brw_vec4_nir and brw_fs_nir.
+ */
+enum glsl_base_type
+brw_glsl_base_type_for_nir_type(nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_float:
+      return GLSL_TYPE_FLOAT;
+
+   case nir_type_int:
+      return GLSL_TYPE_INT;
+
+   case nir_type_unsigned:
+      return GLSL_TYPE_UINT;
+
+   default:
+      unreachable("bad type");
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 8487cef0901..5a1358890cc 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -24,6 +24,7 @@
 #pragma once
 
 #include "brw_context.h"
+#include "brw_reg.h"
 #include "glsl/nir/nir.h"
 
 #ifdef __cplusplus
@@ -77,13 +78,18 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir);
 nir_shader *brw_create_nir(struct brw_context *brw,
                            const struct gl_shader_program *shader_prog,
                            const struct gl_program *prog,
-                           gl_shader_stage stage);
+                           gl_shader_stage stage,
+                           bool is_scalar);
+
+enum brw_reg_type brw_type_for_nir_type(nir_alu_type type);
+
+enum glsl_base_type brw_glsl_base_type_for_nir_type(nir_alu_type type);
 
 void
 brw_process_nir(nir_shader *nir,
                 const struct brw_device_info *devinfo,
                 const struct gl_shader_program *shader_prog,
-                gl_shader_stage stage);
+                gl_shader_stage stage, bool is_scalar);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index f0b018cf84a..c995d2b7e2d 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -43,8 +43,8 @@
 static uint8_t
 get_resolve_status_for_src(nir_src *src)
 {
-   nir_instr *src_instr = nir_src_get_parent_instr(src);
-   if (src_instr) {
+   if (src->is_ssa) {
+      nir_instr *src_instr = src->ssa->parent_instr;
       uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
 
       /* If the source instruction needs resolve, then from the perspective
@@ -66,8 +66,8 @@ get_resolve_status_for_src(nir_src *src)
 static bool
 src_mark_needs_resolve(nir_src *src, void *void_state)
 {
-   nir_instr *src_instr = nir_src_get_parent_instr(src);
-   if (src_instr) {
+   if (src->is_ssa) {
+      nir_instr *src_instr = src->ssa->parent_instr;
       uint8_t resolve_status = src_instr->pass_flags & BRW_NIR_BOOLEAN_MASK;
 
       /* If the source instruction is unresolved, then mark it as needing
@@ -109,28 +109,27 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          uint8_t resolve_status;
          nir_alu_instr *alu = nir_instr_as_alu(instr);
          switch (alu->op) {
-         case nir_op_flt:
-         case nir_op_ilt:
-         case nir_op_ult:
-         case nir_op_fge:
-         case nir_op_ige:
-         case nir_op_uge:
-         case nir_op_feq:
-         case nir_op_ieq:
-         case nir_op_fne:
-         case nir_op_ine:
-         case nir_op_f2b:
-         case nir_op_i2b:
-            /* This instruction will turn into a CMP when we actually emit
-             * so the result will have to be resolved before it can be used.
+         case nir_op_bany2:
+         case nir_op_bany3:
+         case nir_op_bany4:
+         case nir_op_ball_fequal2:
+         case nir_op_ball_iequal2:
+         case nir_op_ball_fequal3:
+         case nir_op_ball_iequal3:
+         case nir_op_ball_fequal4:
+         case nir_op_ball_iequal4:
+         case nir_op_bany_fnequal2:
+         case nir_op_bany_inequal2:
+         case nir_op_bany_fnequal3:
+         case nir_op_bany_inequal3:
+         case nir_op_bany_fnequal4:
+         case nir_op_bany_inequal4:
+            /* These are only implemented by the vec4 backend and its
+             * implementation emits resolved booleans.  At some point in the
+             * future, this may change and we'll have to remove some of the
+             * above cases.
              */
-            resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
-
-            /* Even though the destination is allowed to be left unresolved,
-             * the sources are treated as regular integers or floats so
-             * they need to be resolved.
-             */
-            nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+            resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
             break;
 
          case nir_op_imov:
@@ -169,14 +168,28 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          }
 
          default:
-            resolve_status = BRW_NIR_NON_BOOLEAN;
+            if (nir_op_infos[alu->op].output_type == nir_type_bool) {
+               /* These instructions will turn into a CMP when we actually emit
+                * them so the result will have to be resolved before it can be
+                * used.
+                */
+               resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
+
+               /* Even though the destination is allowed to be left
+                * unresolved, the sources are treated as regular integers or
+                * floats so they need to be resolved.
+                */
+               nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+            } else {
+               resolve_status = BRW_NIR_NON_BOOLEAN;
+            }
          }
 
-         /* If the destination is SSA-like, go ahead allow unresolved booleans.
+         /* If the destination is SSA, go ahead and allow unresolved booleans.
           * If the destination register doesn't have a well-defined parent_instr
           * we need to resolve immediately.
           */
-         if (alu->dest.dest.reg.reg->parent_instr == NULL &&
+         if (!alu->dest.dest.is_ssa &&
              resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
             resolve_status = BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
          }
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index 2c8cd491a8e..7e90e8a8fa1 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -581,7 +581,7 @@ snapshot_statistics_registers(struct brw_context *brw,
    const int group = PIPELINE_STATS_COUNTERS;
    const int num_counters = ctx->PerfMonitor.Groups[group].NumCounters;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    for (int i = 0; i < num_counters; i++) {
       if (BITSET_TEST(monitor->base.ActiveCounters[group], i)) {
@@ -687,7 +687,7 @@ stop_oa_counters(struct brw_context *brw)
  * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
  * including the required PIPE_CONTROL flushes.
  *
- * Sandybridge is the worst case scenario: intel_batchbuffer_emit_mi_flush
+ * Sandybridge is the worst case scenario: brw_emit_mi_flush
  * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
  * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
  * the 3 DWords for MI_REPORT_PERF_COUNT itself.
@@ -710,10 +710,10 @@ emit_mi_report_perf_count(struct brw_context *brw,
    /* Make sure the commands to take a snapshot fits in a single batch. */
    intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
                                    RENDER_RING);
-   int batch_used = brw->batch.used;
+   int batch_used = USED_BATCH(brw->batch);
 
    /* Reports apparently don't always get written unless we flush first. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (brw->gen == 5) {
       /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
@@ -751,10 +751,10 @@ emit_mi_report_perf_count(struct brw_context *brw,
    }
 
    /* Reports apparently don't always get written unless we flush after. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    (void) batch_used;
-   assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
+   assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
 }
 
 /**
@@ -1386,7 +1386,7 @@ void
 brw_perf_monitor_new_batch(struct brw_context *brw)
 {
    assert(brw->batch.ring == RENDER_RING);
-   assert(brw->gen < 6 || brw->batch.used == 0);
+   assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
 
    if (brw->perfmon.oa_users == 0)
       return;
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
new file mode 100644
index 00000000000..7ee3cb680f7
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_fbo.h"
+#include "intel_reg.h"
+
+/**
+ * According to the latest documentation, any PIPE_CONTROL with the
+ * "Command Streamer Stall" bit set must also have another bit set,
+ * with five different options:
+ *
+ *  - Render Target Cache Flush
+ *  - Depth Cache Flush
+ *  - Stall at Pixel Scoreboard
+ *  - Post-Sync Operation
+ *  - Depth Stall
+ *
+ * I chose "Stall at Pixel Scoreboard" since we've used it effectively
+ * in the past, but the choice is fairly arbitrary.
+ */
+static void
+gen8_add_cs_stall_workaround_bits(uint32_t *flags)
+{
+   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                      PIPE_CONTROL_WRITE_IMMEDIATE |
+                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                      PIPE_CONTROL_WRITE_TIMESTAMP |
+                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                      PIPE_CONTROL_DEPTH_STALL;
+
+   /* If we're doing a CS stall, and don't already have one of the
+    * workaround bits set, add "Stall at Pixel Scoreboard."
+    */
+   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
+      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+}
+
+/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+ *
+ * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+ *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ *
+ * Note that the kernel does CS stalls between batches, so we only need
+ * to count them within a batch.
+ */
+static uint32_t
+gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen == 7 && !brw->is_haswell) {
+      if (flags & PIPE_CONTROL_CS_STALL) {
+         /* If we're doing a CS stall, reset the counter and carry on. */
+         brw->pipe_controls_since_last_cs_stall = 0;
+         return 0;
+      }
+
+      /* If this is the fourth pipe control without a CS stall, do one now. */
+      if (++brw->pipe_controls_since_last_cs_stall == 4) {
+         brw->pipe_controls_since_last_cs_stall = 0;
+         return PIPE_CONTROL_CS_STALL;
+      }
+   }
+   return 0;
+}
+
+/**
+ * Emit a PIPE_CONTROL with various flushing flags.
+ *
+ * The caller is responsible for deciding what flags are appropriate for the
+ * given generation.
+ */
+void
+brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
+{
+   if (brw->gen >= 8) {
+      gen8_add_cs_stall_workaround_bits(&flags);
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+      OUT_BATCH(flags);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+      OUT_BATCH(flags);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+}
+
+/**
+ * Emit a PIPE_CONTROL that writes to a buffer object.
+ *
+ * \p flags should contain one of the following items:
+ *  - PIPE_CONTROL_WRITE_IMMEDIATE
+ *  - PIPE_CONTROL_WRITE_TIMESTAMP
+ *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
+ */
+void
+brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
+                            drm_intel_bo *bo, uint32_t offset,
+                            uint32_t imm_lower, uint32_t imm_upper)
+{
+   if (brw->gen >= 8) {
+      gen8_add_cs_stall_workaround_bits(&flags);
+
+      BEGIN_BATCH(6);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+      OUT_BATCH(flags);
+      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   } else if (brw->gen >= 6) {
+      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+
+      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
+       * on later platforms.  We always use PPGTT on Gen7+.
+       */
+      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
+
+      BEGIN_BATCH(5);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+      OUT_BATCH(flags);
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                gen6_gtt | offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
+      OUT_BATCH(imm_lower);
+      OUT_BATCH(imm_upper);
+      ADVANCE_BATCH();
+   }
+}
+
+/**
+ * Restriction [DevSNB, DevIVB]:
+ *
+ * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
+ * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
+ * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
+ * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
+ * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
+ * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
+ * unless SW can otherwise guarantee that the pipeline from WM onwards is
+ * already flushed (e.g., via a preceding MI_FLUSH).
+ */
+void
+brw_emit_depth_stall_flushes(struct brw_context *brw)
+{
+   assert(brw->gen >= 6 && brw->gen <= 9);
+
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
+}
+
+/**
+ * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
+ * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
+ *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
+ *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
+ *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
+ *  to be sent before any combination of VS associated 3DSTATE."
+ */
+void
+gen7_emit_vs_workaround_flush(struct brw_context *brw)
+{
+   assert(brw->gen == 7);
+   brw_emit_pipe_control_write(brw,
+                               PIPE_CONTROL_WRITE_IMMEDIATE
+                               | PIPE_CONTROL_DEPTH_STALL,
+                               brw->workaround_bo, 0,
+                               0, 0);
+}
+
+
+/**
+ * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
+ */
+void
+gen7_emit_cs_stall_flush(struct brw_context *brw)
+{
+   brw_emit_pipe_control_write(brw,
+                               PIPE_CONTROL_CS_STALL
+                               | PIPE_CONTROL_WRITE_IMMEDIATE,
+                               brw->workaround_bo, 0,
+                               0, 0);
+}
+
+
+/**
+ * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
+ * implementing two workarounds on gen6.  From section 1.4.7.1
+ * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
+ *
+ * [DevSNB-C+{W/A}] Before any depth stall flush (including those
+ * produced by non-pipelined state commands), software needs to first
+ * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
+ * 0.
+ *
+ * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
+ * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
+ *
+ * And the workaround for these two requires this workaround first:
+ *
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * BEFORE the pipe-control with a post-sync op and no write-cache
+ * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ *     "1 of the following must also be set:
+ *      - Render Target Cache Flush Enable ([12] of DW1)
+ *      - Depth Cache Flush Enable ([0] of DW1)
+ *      - Stall at Pixel Scoreboard ([1] of DW1)
+ *      - Depth Stall ([13] of DW1)
+ *      - Post-Sync Operation ([13] of DW1)
+ *      - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it.  Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either.  Notify enable is IRQs, which aren't
+ * really our business.  That leaves only stall at scoreboard.
+ */
+void
+brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
+{
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_CS_STALL |
+                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
+
+   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
+                               brw->workaround_bo, 0, 0, 0);
+}
+
+/* Emit a pipelined flush to either flush render and texture cache for
+ * reading from an FBO-drawn texture, or flush so that frontbuffer
+ * render appears on the screen in DRI1.
+ *
+ * This is also used for the always_flush_cache driconf debug option.
+ */
+void
+brw_emit_mi_flush(struct brw_context *brw)
+{
+   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
+      BEGIN_BATCH_BLT(4);
+      OUT_BATCH(MI_FLUSH_DW);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   } else {
+      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
+      if (brw->gen >= 6) {
+         if (brw->gen == 9) {
+            /* Hardware workaround: SKL
+             *
+             * Emit Pipe Control with all bits set to zero before emitting
+             * a Pipe Control with VF Cache Invalidate set.
+             */
+            brw_emit_pipe_control_flush(brw, 0);
+         }
+
+         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
+                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                  PIPE_CONTROL_CS_STALL;
+
+         if (brw->gen == 6) {
+            /* Hardware workaround: SNB B-Spec says:
+             *
+             * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
+             * Flush Enable =1, a PIPE_CONTROL with any non-zero
+             * post-sync-op is required.
+             */
+            brw_emit_post_sync_nonzero_flush(brw);
+         }
+      }
+      brw_emit_pipe_control_flush(brw, flags);
+   }
+
+   brw_render_cache_set_clear(brw);
+}
+
+int
+brw_init_pipe_control(struct brw_context *brw,
+                      const struct brw_device_info *devinfo)
+{
+   if (devinfo->gen < 6)
+      return 0;
+
+   /* We can't just use brw_state_batch to get a chunk of space for
+    * the gen6 workaround because it involves actually writing to
+    * the buffer, and the kernel doesn't let us write to the batch.
+    */
+   brw->workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
+                                           "pipe_control workaround",
+                                           4096, 4096);
+   if (brw->workaround_bo == NULL)
+      return -ENOMEM;
+
+   brw->pipe_controls_since_last_cs_stall = 0;
+
+   return 0;
+}
+
+void
+brw_fini_pipe_control(struct brw_context *brw)
+{
+   drm_intel_bo_unreference(brw->workaround_bo);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_primitive_restart.c b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
index 2c7a7e8b8dd..6ed79d7cb75 100644
--- a/src/mesa/drivers/dri/i965/brw_primitive_restart.c
+++ b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
@@ -161,7 +161,8 @@ brw_handle_primitive_restart(struct gl_context *ctx,
       /* Cut index should work for primitive restart, so use it
        */
       brw->prim_restart.enable_cut_index = true;
-      brw_draw_prims(ctx, prims, nr_prims, ib, GL_FALSE, -1, -1, NULL, indirect);
+      brw_draw_prims(ctx, prims, nr_prims, ib, GL_FALSE, -1, -1, NULL, 0,
+                     indirect);
       brw->prim_restart.enable_cut_index = false;
    } else {
       /* Not all the primitive draw modes are supported by the cut index,
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index ea128ccb670..5a54cd39076 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -143,7 +143,7 @@ brwProgramStringNotify(struct gl_context *ctx,
       brw_add_texrect_params(prog);
 
       if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
-         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT);
+         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
       }
 
       brw_fs_precompile(ctx, NULL, prog);
@@ -169,7 +169,8 @@ brwProgramStringNotify(struct gl_context *ctx,
       brw_add_texrect_params(prog);
 
       if (ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
-         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX);
+         prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
+                                    brw->intelScreen->compiler->scalar_vs);
       }
 
       brw_vs_precompile(ctx, NULL, prog);
@@ -196,7 +197,7 @@ brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
    unsigned bits = (PIPE_CONTROL_DATA_CACHE_INVALIDATE |
                     PIPE_CONTROL_NO_WRITE |
                     PIPE_CONTROL_CS_STALL);
-   assert(brw->gen >= 7 && brw->gen <= 8);
+   assert(brw->gen >= 7 && brw->gen <= 9);
 
    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                    GL_ELEMENT_ARRAY_BARRIER_BIT |
@@ -574,10 +575,13 @@ brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
             struct gl_shader *shader, struct gl_program *prog)
 {
    if (shader_prog) {
-      fprintf(stderr,
-              "GLSL IR for native %s shader %d:\n", stage, shader_prog->Name);
-      _mesa_print_ir(stderr, shader->ir, NULL);
-      fprintf(stderr, "\n\n");
+      if (shader->ir) {
+         fprintf(stderr,
+                 "GLSL IR for native %s shader %d:\n",
+                 stage, shader_prog->Name);
+         _mesa_print_ir(stderr, shader->ir, NULL);
+         fprintf(stderr, "\n\n");
+      }
    } else {
       fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
               stage, prog->Id, stage);
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index aea4d9b77d3..d6b012c392e 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -497,13 +497,22 @@ brw_get_timestamp(struct gl_context *ctx)
    struct brw_context *brw = brw_context(ctx);
    uint64_t result = 0;
 
-   drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
+   switch (brw->intelScreen->hw_has_timestamp) {
+   case 3: /* New kernel, always full 36bit accuracy */
+      drm_intel_reg_read(brw->bufmgr, TIMESTAMP | 1, &result);
+      break;
+   case 2: /* 64bit kernel, result is left-shifted by 32bits, losing 4bits */
+      drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
+      result = result >> 32;
+      break;
+   case 1: /* 32bit kernel, result is 36bit wide but may be inaccurate! */
+      drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
+      break;
+   }
 
    /* See logic in brw_queryobj_get_results() */
-   result = result >> 32;
    result *= 80;
    result &= (1ull << 36) - 1;
-
    return result;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index c8b134103bb..31806f769bd 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -853,7 +853,7 @@ static inline struct brw_reg
 spread(struct brw_reg reg, unsigned s)
 {
    if (s) {
-      assert(is_power_of_two(s));
+      assert(_mesa_is_pow_two(s));
 
       if (reg.hstride)
          reg.hstride += cvt(s) - 1;
@@ -950,6 +950,12 @@ brw_set_writemask(struct brw_reg reg, unsigned mask)
    return reg;
 }
 
+static inline unsigned
+brw_writemask_for_size(unsigned n)
+{
+   return (1 << n) - 1;
+}
+
 static inline struct brw_reg
 negate(struct brw_reg reg)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 22ccbfe8461..2021bb3b460 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -425,11 +425,11 @@ brw_update_sampler_state(struct brw_context *brw,
 
    /* Enable anisotropic filtering if desired. */
    unsigned max_anisotropy = BRW_ANISORATIO_2;
-   if (sampler->MaxAnisotropy > 1.0) {
+   if (sampler->MaxAnisotropy > 1.0f) {
       min_filter = BRW_MAPFILTER_ANISOTROPIC;
       mag_filter = BRW_MAPFILTER_ANISOTROPIC;
 
-      if (sampler->MaxAnisotropy > 2.0) {
+      if (sampler->MaxAnisotropy > 2.0f) {
 	 max_anisotropy =
             MIN2((sampler->MaxAnisotropy - 2) / 2, BRW_ANISORATIO_16);
       }
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index ee0add5d765..b49961fff68 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -1314,8 +1314,8 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
                 * single-result send is probably actually reducing register
                 * pressure.
                 */
-               if (inst->regs_written <= inst->dst.width / 8 &&
-                   chosen_inst->regs_written > chosen_inst->dst.width / 8) {
+               if (inst->regs_written <= inst->exec_size / 8 &&
+                   chosen_inst->regs_written > chosen_inst->exec_size / 8) {
                   chosen = n;
                   continue;
                } else if (inst->regs_written > chosen_inst->regs_written) {
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 5d9892214a9..b126f82ebbf 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -45,7 +45,7 @@ static void upload_sf_vp(struct brw_context *brw)
    struct gl_context *ctx = &brw->ctx;
    struct brw_sf_viewport *sfv;
    GLfloat y_scale, y_bias;
-   double scale[3], translate[3];
+   float scale[3], translate[3];
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
    sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
@@ -220,7 +220,7 @@ static void upload_sf_unit( struct brw_context *brw )
 
    /* _NEW_LINE */
    sf->sf6.line_width =
-      CLAMP(ctx->Line.Width, 1.0, ctx->Const.MaxLineWidth) * (1<<1);
+      CLAMP(ctx->Line.Width, 1.0f, ctx->Const.MaxLineWidth) * (1<<1);
 
    sf->sf6.line_endcap_aa_region_width = 1;
    if (ctx->Line.SmoothFlag)
@@ -259,9 +259,10 @@ static void upload_sf_unit( struct brw_context *brw )
 
    /* _NEW_POINT */
    sf->sf7.sprite_point = ctx->Point.PointSprite;
-   sf->sf7.point_size = CLAMP(rint(CLAMP(ctx->Point.Size,
-					 ctx->Point.MinSize,
-					 ctx->Point.MaxSize)), 1, 255) * (1<<3);
+   sf->sf7.point_size = CLAMP(rintf(CLAMP(ctx->Point.Size,
+                                          ctx->Point.MinSize,
+                                          ctx->Point.MaxSize)), 1.0f, 255.0f) *
+                        (1<<3);
    /* _NEW_PROGRAM | _NEW_POINT */
    sf->sf7.use_point_size_state = !(ctx->VertexProgram.PointSizeEnabled ||
 				    ctx->Point._Attenuated);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 06393c8ff2b..67b8dde7cc8 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -113,22 +113,32 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 	 (i == MESA_SHADER_FRAGMENT);
       compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
       compiler->glsl_compiler_options[i].LowerClipDistance = true;
+
+      /* !ARB_gpu_shader5 */
+      if (devinfo->gen < 7)
+         compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
    }
 
    compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = true;
    compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
 
-   if (compiler->scalar_vs) {
-      /* If we're using the scalar backend for vertex shaders, we need to
-       * configure these accordingly.
-       */
-      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
-      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
-      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false;
+   if (compiler->scalar_vs || brw_env_var_as_boolean("INTEL_USE_NIR", false)) {
+      if (compiler->scalar_vs) {
+         /* If we're using the scalar backend for vertex shaders, we need to
+          * configure these accordingly.
+          */
+         compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
+         compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
+         compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false;
+      }
 
       compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options;
    }
 
+   if (brw_env_var_as_boolean("INTEL_USE_NIR", false)) {
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].NirOptions = nir_options;
+   }
+
    compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions = nir_options;
    compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions = nir_options;
 
@@ -229,7 +239,8 @@ brw_lower_packing_builtins(struct brw_context *brw,
 }
 
 static void
-process_glsl_ir(struct brw_context *brw,
+process_glsl_ir(gl_shader_stage stage,
+                struct brw_context *brw,
                 struct gl_shader_program *shader_prog,
                 struct gl_shader *shader)
 {
@@ -255,7 +266,9 @@ process_glsl_ir(struct brw_context *brw,
                       EXP_TO_EXP2 |
                       LOG_TO_LOG2 |
                       bitfield_insert |
-                      LDEXP_TO_ARITH);
+                      LDEXP_TO_ARITH |
+                      CARRY_TO_ARITH |
+                      BORROW_TO_ARITH);
 
    /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
     * if-statements need to be flattened.
@@ -275,15 +288,17 @@ process_glsl_ir(struct brw_context *brw,
    lower_quadop_vector(shader->ir, false);
 
    bool lowered_variable_indexing =
-      lower_variable_index_to_cond_assign(shader->ir,
+      lower_variable_index_to_cond_assign((gl_shader_stage)stage,
+                                          shader->ir,
                                           options->EmitNoIndirectInput,
                                           options->EmitNoIndirectOutput,
                                           options->EmitNoIndirectTemp,
                                           options->EmitNoIndirectUniform);
 
    if (unlikely(brw->perf_debug && lowered_variable_indexing)) {
-      perf_debug("Unsupported form of variable indexing in FS; falling "
-                 "back to very inefficient code generation\n");
+      perf_debug("Unsupported form of variable indexing in %s; falling "
+                 "back to very inefficient code generation\n",
+                 _mesa_shader_stage_to_abbrev(shader->Stage));
    }
 
    lower_ubo_reference(shader, shader->ir);
@@ -308,7 +323,7 @@ process_glsl_ir(struct brw_context *brw,
    } while (progress);
 
    if (options->NirOptions != NULL)
-      lower_output_reads(shader->ir);
+      lower_output_reads(stage, shader->ir);
 
    validate_ir_tree(shader->ir);
 
@@ -352,7 +367,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 
       _mesa_copy_linked_program_data((gl_shader_stage) stage, shProg, prog);
 
-      process_glsl_ir(brw, shProg, shader);
+      process_glsl_ir((gl_shader_stage) stage, brw, shProg, shader);
 
       /* Make a pass over the IR to add state references for any built-in
        * uniforms that are used.  This has to be done now (during linking).
@@ -387,8 +402,10 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 
       brw_add_texrect_params(prog);
 
-      if (options->NirOptions)
-         prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage);
+      if (options->NirOptions) {
+         prog->nir = brw_create_nir(brw, shProg, prog, (gl_shader_stage) stage,
+                                    is_scalar_shader_stage(brw, stage));
+      }
 
       _mesa_reference_program(ctx, &prog, NULL);
    }
@@ -422,6 +439,7 @@ brw_type_for_base_type(const struct glsl_type *type)
       return BRW_REGISTER_TYPE_F;
    case GLSL_TYPE_INT:
    case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_SUBROUTINE:
       return BRW_REGISTER_TYPE_D;
    case GLSL_TYPE_UINT:
       return BRW_REGISTER_TYPE_UD;
@@ -528,6 +546,8 @@ brw_instruction_name(enum opcode op)
       return opcode_descs[op].name;
    case FS_OPCODE_FB_WRITE:
       return "fb_write";
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      return "fb_write_logical";
    case FS_OPCODE_BLORP_FB_WRITE:
       return "blorp_fb_write";
    case FS_OPCODE_REP_FB_WRITE:
@@ -556,43 +576,80 @@ brw_instruction_name(enum opcode op)
 
    case SHADER_OPCODE_TEX:
       return "tex";
+   case SHADER_OPCODE_TEX_LOGICAL:
+      return "tex_logical";
    case SHADER_OPCODE_TXD:
       return "txd";
+   case SHADER_OPCODE_TXD_LOGICAL:
+      return "txd_logical";
    case SHADER_OPCODE_TXF:
       return "txf";
+   case SHADER_OPCODE_TXF_LOGICAL:
+      return "txf_logical";
    case SHADER_OPCODE_TXL:
       return "txl";
+   case SHADER_OPCODE_TXL_LOGICAL:
+      return "txl_logical";
    case SHADER_OPCODE_TXS:
       return "txs";
+   case SHADER_OPCODE_TXS_LOGICAL:
+      return "txs_logical";
    case FS_OPCODE_TXB:
       return "txb";
+   case FS_OPCODE_TXB_LOGICAL:
+      return "txb_logical";
    case SHADER_OPCODE_TXF_CMS:
       return "txf_cms";
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+      return "txf_cms_logical";
    case SHADER_OPCODE_TXF_UMS:
       return "txf_ums";
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+      return "txf_ums_logical";
    case SHADER_OPCODE_TXF_MCS:
       return "txf_mcs";
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+      return "txf_mcs_logical";
    case SHADER_OPCODE_LOD:
       return "lod";
+   case SHADER_OPCODE_LOD_LOGICAL:
+      return "lod_logical";
    case SHADER_OPCODE_TG4:
       return "tg4";
+   case SHADER_OPCODE_TG4_LOGICAL:
+      return "tg4_logical";
    case SHADER_OPCODE_TG4_OFFSET:
       return "tg4_offset";
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      return "tg4_offset_logical";
+
    case SHADER_OPCODE_SHADER_TIME_ADD:
       return "shader_time_add";
 
    case SHADER_OPCODE_UNTYPED_ATOMIC:
       return "untyped_atomic";
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+      return "untyped_atomic_logical";
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
       return "untyped_surface_read";
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+      return "untyped_surface_read_logical";
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
       return "untyped_surface_write";
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+      return "untyped_surface_write_logical";
    case SHADER_OPCODE_TYPED_ATOMIC:
       return "typed_atomic";
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+      return "typed_atomic_logical";
    case SHADER_OPCODE_TYPED_SURFACE_READ:
       return "typed_surface_read";
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      return "typed_surface_read_logical";
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
       return "typed_surface_write";
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return "typed_surface_write_logical";
    case SHADER_OPCODE_MEMORY_FENCE:
       return "memory_fence";
 
@@ -653,8 +710,6 @@ brw_instruction_name(enum opcode op)
    case FS_OPCODE_DISCARD_JUMP:
       return "discard_jump";
 
-   case FS_OPCODE_SET_OMASK:
-      return "set_omask";
    case FS_OPCODE_SET_SAMPLE_ID:
       return "set_sample_id";
    case FS_OPCODE_SET_SIMD4X2_OFFSET:
@@ -724,6 +779,8 @@ brw_instruction_name(enum opcode op)
       return "cs_terminate";
    case SHADER_OPCODE_BARRIER:
       return "barrier";
+   case SHADER_OPCODE_MULH:
+      return "mulh";
    }
 
    unreachable("not reached");
@@ -942,6 +999,7 @@ backend_instruction::is_commutative() const
    case BRW_OPCODE_XOR:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
       return true;
    case BRW_OPCODE_SEL:
       /* MIN and MAX are commutative. */
@@ -1049,6 +1107,7 @@ backend_instruction::can_do_saturate() const
    case BRW_OPCODE_MATH:
    case BRW_OPCODE_MOV:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_PLN:
    case BRW_OPCODE_RNDD:
    case BRW_OPCODE_RNDE:
@@ -1147,10 +1206,14 @@ backend_instruction::has_side_effects() const
 {
    switch (opcode) {
    case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
    case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
    case SHADER_OPCODE_MEMORY_FENCE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
    case FS_OPCODE_FB_WRITE:
@@ -1356,3 +1419,43 @@ backend_shader::assign_common_binding_table_offsets(uint32_t next_binding_table_
 
    /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */
 }
+
+/**
+ * Upload the brw_image_param structure of each image in \p storage as uniform
+ * values and mark the corresponding binding table entries as used.
+ *
+ * The image slots are looked up per shader stage, with the stage derived from
+ * prog->Target.  At least one image is always processed, even when
+ * storage->array_elements is zero.
+ */
+void
+backend_shader::setup_image_uniform_values(const gl_uniform_storage *storage)
+{
+   const unsigned stage = _mesa_program_enum_to_shader_stage(prog->Target);
+
+   for (unsigned i = 0; i < MAX2(storage->array_elements, 1); i++) {
+      const unsigned image_idx = storage->image[stage].index + i;
+      const brw_image_param *param = &stage_prog_data->image_param[image_idx];
+
+      /* Upload the brw_image_param structure.  The order is expected to match
+       * the BRW_IMAGE_PARAM_*_OFFSET defines.
+       */
+      setup_vector_uniform_values(
+         (const gl_constant_value *)&param->surface_idx, 1);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->offset, 2);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->size, 3);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->stride, 4);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->tiling, 3);
+      setup_vector_uniform_values(
+         (const gl_constant_value *)param->swizzling, 2);
+
+      /* Flag this image's binding table entry as used. */
+      brw_mark_surface_used(
+         stage_prog_data,
+         stage_prog_data->binding_table.image_start + image_idx);
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index b2c1a0b8d69..2cc97f24972 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "main/compiler.h"
 #include "glsl/ir.h"
+#include "program/prog_parameter.h"
 
 #ifdef __cplusplus
 #include "brw_ir_allocator.h"
@@ -268,6 +269,10 @@ public:
    void assign_common_binding_table_offsets(uint32_t next_binding_table_offset);
 
    virtual void invalidate_live_intervals() = 0;
+
+   virtual void setup_vector_uniform_values(const gl_constant_value *values,
+                                            unsigned n) = 0;
+   void setup_image_uniform_values(const gl_uniform_storage *storage);
 };
 
 uint32_t brw_texture_offset(int *offsets, unsigned num_components);
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 987672f8815..78a1f874b4e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -72,8 +72,10 @@ extern const struct brw_tracked_state brw_vs_samplers;
 extern const struct brw_tracked_state brw_gs_samplers;
 extern const struct brw_tracked_state brw_vs_ubo_surfaces;
 extern const struct brw_tracked_state brw_vs_abo_surfaces;
+extern const struct brw_tracked_state brw_vs_image_surfaces;
 extern const struct brw_tracked_state brw_gs_ubo_surfaces;
 extern const struct brw_tracked_state brw_gs_abo_surfaces;
+extern const struct brw_tracked_state brw_gs_image_surfaces;
 extern const struct brw_tracked_state brw_vs_unit;
 extern const struct brw_tracked_state brw_gs_prog;
 extern const struct brw_tracked_state brw_wm_prog;
@@ -84,7 +86,9 @@ extern const struct brw_tracked_state brw_gs_binding_table;
 extern const struct brw_tracked_state brw_vs_binding_table;
 extern const struct brw_tracked_state brw_wm_ubo_surfaces;
 extern const struct brw_tracked_state brw_wm_abo_surfaces;
+extern const struct brw_tracked_state brw_wm_image_surfaces;
 extern const struct brw_tracked_state brw_cs_abo_surfaces;
+extern const struct brw_tracked_state brw_cs_image_surfaces;
 extern const struct brw_tracked_state brw_wm_unit;
 extern const struct brw_tracked_state brw_interpolation_map;
 
@@ -121,7 +125,6 @@ extern const struct brw_tracked_state gen6_wm_state;
 extern const struct brw_tracked_state gen7_depthbuffer;
 extern const struct brw_tracked_state gen7_clip_state;
 extern const struct brw_tracked_state gen7_disable_stages;
-extern const struct brw_tracked_state gen7_gs_push_constants;
 extern const struct brw_tracked_state gen7_gs_state;
 extern const struct brw_tracked_state gen7_ps_state;
 extern const struct brw_tracked_state gen7_push_constant_space;
@@ -132,6 +135,7 @@ extern const struct brw_tracked_state gen7_sol_state;
 extern const struct brw_tracked_state gen7_urb;
 extern const struct brw_tracked_state gen7_vs_state;
 extern const struct brw_tracked_state gen7_wm_state;
+extern const struct brw_tracked_state gen7_hw_binding_tables;
 extern const struct brw_tracked_state haswell_cut_index;
 extern const struct brw_tracked_state gen8_blend_state;
 extern const struct brw_tracked_state gen8_disable_stages;
@@ -266,15 +270,6 @@ void brw_update_renderbuffer_surfaces(struct brw_context *brw,
                                       uint32_t render_target_start,
                                       uint32_t *surf_offset);
 
-/* gen7_wm_state.c */
-void
-gen7_upload_ps_state(struct brw_context *brw,
-                     const struct gl_fragment_program *fp,
-                     const struct brw_stage_state *stage_state,
-                     const struct brw_wm_prog_data *prog_data,
-                     bool enable_dual_src_blend, unsigned sample_mask,
-                     unsigned fast_clear_op);
-
 /* gen7_wm_surface_state.c */
 uint32_t gen7_surface_tiling_mode(uint32_t tiling);
 uint32_t gen7_surface_msaa_bits(unsigned num_samples, enum intel_msaa_layout l);
@@ -372,6 +367,20 @@ gen7_upload_constant_state(struct brw_context *brw,
                            const struct brw_stage_state *stage_state,
                            bool active, unsigned opcode);
 
+void gen7_rs_control(struct brw_context *brw, int enable);
+
+void gen7_edit_hw_binding_table_entry(struct brw_context *brw,
+                                      gl_shader_stage stage,
+                                      uint32_t index,
+                                      uint32_t surf_offset);
+void gen7_update_binding_table_from_array(struct brw_context *brw,
+                                          gl_shader_stage stage,
+                                          const uint32_t* binding_table,
+                                          int num_surfaces);
+void gen7_enable_hw_binding_tables(struct brw_context *brw);
+void gen7_disable_hw_binding_tables(struct brw_context *brw);
+void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index a405a80ef6e..d79e0ea00c7 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -87,7 +87,7 @@ brw_annotate_aub(struct brw_context *brw)
    drm_intel_aub_annotation annotations[annotation_count];
    int a = 0;
    make_annotation(&annotations[a++], AUB_TRACE_TYPE_BATCH, 0,
-                   4*brw->batch.used);
+                   4 * USED_BATCH(brw->batch));
    for (int i = brw->state_batch_count; i-- > 0; ) {
       uint32_t type = brw->state_batch_list[i].type;
       uint32_t start_offset = brw->state_batch_list[i].offset;
@@ -136,7 +136,7 @@ __brw_state_batch(struct brw_context *brw,
     * space, then flush and try again.
     */
    if (batch->state_batch_offset < size ||
-       offset < 4*batch->used + batch->reserved_space) {
+       offset < 4 * USED_BATCH(*batch) + batch->reserved_space) {
       intel_batchbuffer_flush(brw);
       offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 24778d25379..5effb4c8829 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -200,36 +200,23 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
 }
 
 /**
- * Attempts to find an item in the cache with identical data and aux
- * data to use
+ * Attempts to find an item in the cache with identical data.
  */
-static bool
-brw_try_upload_using_copy(struct brw_cache *cache,
-			  struct brw_cache_item *result_item,
-			  const void *data,
-			  const void *aux)
+static const struct brw_cache_item *
+brw_lookup_prog(const struct brw_cache *cache,
+                enum brw_cache_id cache_id,
+                const void *data, unsigned data_size)
 {
-   struct brw_context *brw = cache->brw;
+   const struct brw_context *brw = cache->brw;
    int i;
-   struct brw_cache_item *item;
+   const struct brw_cache_item *item;
 
    for (i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
-	 const void *item_aux = item->key + item->key_size;
 	 int ret;
 
-	 if (item->cache_id != result_item->cache_id ||
-	     item->size != result_item->size ||
-	     item->aux_size != result_item->aux_size) {
+	 if (item->cache_id != cache_id || item->size != data_size)
 	    continue;
-	 }
-
-         if (cache->aux_compare[result_item->cache_id]) {
-            if (!cache->aux_compare[result_item->cache_id](item_aux, aux))
-               continue;
-         } else if (memcmp(item_aux, aux, item->aux_size) != 0) {
-	    continue;
-	 }
 
          if (!brw->has_llc)
             drm_intel_bo_map(cache->bo, false);
@@ -239,27 +226,24 @@ brw_try_upload_using_copy(struct brw_cache *cache,
 	 if (ret)
 	    continue;
 
-	 result_item->offset = item->offset;
-
-	 return true;
+	 return item;
       }
    }
 
-   return false;
+   return NULL;
 }
 
-static void
-brw_upload_item_data(struct brw_cache *cache,
-		     struct brw_cache_item *item,
-		     const void *data)
+static uint32_t
+brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
 {
+   uint32_t offset;
    struct brw_context *brw = cache->brw;
 
    /* Allocate space in the cache BO for our new program. */
-   if (cache->next_offset + item->size > cache->bo->size) {
+   if (cache->next_offset + size > cache->bo->size) {
       uint32_t new_size = cache->bo->size * 2;
 
-      while (cache->next_offset + item->size > new_size)
+      while (cache->next_offset + size > new_size)
 	 new_size *= 2;
 
       brw_cache_new_bo(cache, new_size);
@@ -273,10 +257,12 @@ brw_upload_item_data(struct brw_cache *cache,
       brw_cache_new_bo(cache, cache->bo->size);
    }
 
-   item->offset = cache->next_offset;
+   offset = cache->next_offset;
 
    /* Programs are always 64-byte aligned, so set up the next one now */
-   cache->next_offset = ALIGN(item->offset + item->size, 64);
+   cache->next_offset = ALIGN(offset + size, 64);
+
+   return offset;
 }
 
 void
@@ -293,6 +279,8 @@ brw_upload_cache(struct brw_cache *cache,
 {
    struct brw_context *brw = cache->brw;
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
+   const struct brw_cache_item *matching_data =
+      brw_lookup_prog(cache, cache_id, data, data_size);
    GLuint hash;
    void *tmp;
 
@@ -304,15 +292,23 @@ brw_upload_cache(struct brw_cache *cache,
    hash = hash_key(item);
    item->hash = hash;
 
-   /* If we can find a matching prog/prog_data combo in the cache
-    * already, then reuse the existing stuff.  This will mean not
-    * flagging CACHE_NEW_* when transitioning between the two
-    * equivalent hash keys.  This is notably useful for programs
-    * generating shaders at runtime, where multiple shaders may
-    * compile to the thing in our backend.
+   /* If we can find a matching prog in the cache already, then reuse the
+    * existing data without creating a new copy in the underlying buffer
+    * object. This is notably useful for programs generating shaders at
+    * runtime, where multiple shaders may compile to the same thing in our
+    * backend.
     */
-   if (!brw_try_upload_using_copy(cache, item, data, aux)) {
-      brw_upload_item_data(cache, item, data);
+   if (matching_data) {
+      item->offset = matching_data->offset;
+   } else {
+      item->offset = brw_alloc_item_data(cache, data_size);
+
+      /* Copy data to the buffer */
+      if (brw->has_llc) {
+         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
+      } else {
+         drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
+      }
    }
 
    /* Set up the memory containing the key and aux_data */
@@ -323,7 +319,7 @@ brw_upload_cache(struct brw_cache *cache,
 
    item->key = tmp;
 
-   if (cache->n_items > cache->size * 1.5)
+   if (cache->n_items > cache->size * 1.5f)
       rehash(cache);
 
    hash %= cache->size;
@@ -331,13 +327,6 @@ brw_upload_cache(struct brw_cache *cache,
    cache->items[hash] = item;
    cache->n_items++;
 
-   /* Copy data to the buffer */
-   if (brw->has_llc) {
-      memcpy((char *) cache->bo->virtual + item->offset, data, data_size);
-   } else {
-      drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
-   }
-
    *out_offset = item->offset;
    *(void **)out_aux = (void *)((char *)item->key + item->key_size);
    cache->brw->ctx.NewDriverState |= 1 << cache_id;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 08d1ac28885..9de42ce8503 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -192,6 +192,12 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
    &gen6_color_calc_state,	/* must do before cc unit */
    &gen6_depth_stencil_state,	/* must do before cc unit */
 
+   &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Haswell */
+
+   &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
+   &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
+   &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
+
    &gen6_vs_push_constants, /* Before vs_state */
    &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
@@ -251,6 +257,7 @@ static const struct brw_tracked_state *gen7_render_atoms[] =
 static const struct brw_tracked_state *gen7_compute_atoms[] =
 {
    &brw_state_base_address,
+   &brw_cs_image_surfaces,
    &brw_cs_abo_surfaces,
    &brw_cs_state,
 };
@@ -268,6 +275,12 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
    &gen8_blend_state,
    &gen6_color_calc_state,
 
+   &gen7_hw_binding_tables, /* Enable hw-generated binding tables for Broadwell */
+
+   &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
+   &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
+   &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
+
    &gen6_vs_push_constants, /* Before vs_state */
    &gen6_gs_push_constants, /* Before gs_state */
    &gen6_wm_push_constants, /* Before wm_surfaces and constant_buffer */
@@ -334,6 +347,7 @@ static const struct brw_tracked_state *gen8_render_atoms[] =
 static const struct brw_tracked_state *gen8_compute_atoms[] =
 {
    &gen8_state_base_address,
+   &brw_cs_image_surfaces,
    &brw_cs_abo_surfaces,
    &brw_cs_state,
 };
@@ -349,7 +363,7 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
       return;
 
    if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
+      brw_emit_post_sync_nonzero_flush(brw);
 
    brw_upload_invariant_state(brw);
 
@@ -468,6 +482,7 @@ void brw_init_state( struct brw_context *brw )
    ctx->DriverFlags.NewUniformBuffer = BRW_NEW_UNIFORM_BUFFER;
    ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER;
    ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER;
+   ctx->DriverFlags.NewImageUnits = BRW_NEW_IMAGE_UNITS;
 }
 
 
@@ -581,6 +596,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_STATS_WM),
    DEFINE_BIT(BRW_NEW_UNIFORM_BUFFER),
    DEFINE_BIT(BRW_NEW_ATOMIC_BUFFER),
+   DEFINE_BIT(BRW_NEW_IMAGE_UNITS),
    DEFINE_BIT(BRW_NEW_META_IN_PROGRESS),
    DEFINE_BIT(BRW_NEW_INTERPOLATION_MAP),
    DEFINE_BIT(BRW_NEW_PUSH_CONSTANT_ALLOCATION),
@@ -710,7 +726,7 @@ brw_upload_pipeline_state(struct brw_context *brw,
 
    /* Emit Sandybridge workaround flushes on every primitive, for safety. */
    if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
+      brw_emit_post_sync_nonzero_flush(brw);
 
    brw_upload_programs(brw, pipeline);
    merge_ctx_state(brw, &state);
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index 05016067bba..a33fd88a026 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -813,3 +813,120 @@ brw_depth_format(struct brw_context *brw, mesa_format format)
       unreachable("Unexpected depth format.");
    }
 }
+
+/**
+ * Pick the format used in place of an image format on a given device.
+ *
+ * Returns \p format unchanged when no lowering is needed for the device
+ * described by \p devinfo; otherwise returns an unsigned integer format
+ * chosen from devinfo->gen and devinfo->is_haswell.  Formats not listed in
+ * the switch hit unreachable().
+ */
+mesa_format
+brw_lower_mesa_image_format(const struct brw_device_info *devinfo,
+                            mesa_format format)
+{
+   switch (format) {
+   /* These are never lowered.  Up to BDW we'll have to fall back to untyped
+    * surface access for 128bpp formats.
+    */
+   case MESA_FORMAT_RGBA_UINT32:
+   case MESA_FORMAT_RGBA_SINT32:
+   case MESA_FORMAT_RGBA_FLOAT32:
+   case MESA_FORMAT_R_UINT32:
+   case MESA_FORMAT_R_SINT32:
+   case MESA_FORMAT_R_FLOAT32:
+      return format;
+
+   /* From HSW to BDW the only 64bpp format supported for typed access is
+    * RGBA_UINT16.  IVB falls back to untyped.
+    */
+   case MESA_FORMAT_RGBA_UINT16:
+   case MESA_FORMAT_RGBA_SINT16:
+   case MESA_FORMAT_RGBA_FLOAT16:
+   case MESA_FORMAT_RG_UINT32:
+   case MESA_FORMAT_RG_SINT32:
+   case MESA_FORMAT_RG_FLOAT32:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT16 : MESA_FORMAT_RG_UINT32);
+
+   /* Up to BDW no SINT or FLOAT formats of less than 32 bits per component
+    * are supported.  IVB doesn't support formats with more than one component
+    * for typed access.  For 8 and 16 bpp formats IVB relies on the
+    * undocumented behavior that typed reads from R_UINT8 and R_UINT16
+    * surfaces actually do a 32-bit misaligned read.  The alternative would be
+    * to use two surface state entries with different formats for each image,
+    * one for reading (using R_UINT32) and another one for writing (using
+    * R_UINT8 or R_UINT16), but that would complicate the shaders we generate
+    * even more.
+    */
+   case MESA_FORMAT_RGBA_UINT8:
+   case MESA_FORMAT_RGBA_SINT8:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT8 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_RG_UINT16:
+   case MESA_FORMAT_RG_SINT16:
+   case MESA_FORMAT_RG_FLOAT16:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT16 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_RG_UINT8:
+   case MESA_FORMAT_RG_SINT8:
+      return (devinfo->gen >= 9 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT8 : MESA_FORMAT_R_UINT16);
+
+   case MESA_FORMAT_R_UINT16:
+   case MESA_FORMAT_R_FLOAT16:
+   case MESA_FORMAT_R_SINT16:
+      return (devinfo->gen >= 9 ? format : MESA_FORMAT_R_UINT16);
+
+   case MESA_FORMAT_R_UINT8:
+   case MESA_FORMAT_R_SINT8:
+      return (devinfo->gen >= 9 ? format : MESA_FORMAT_R_UINT8);
+
+   /* Neither the 2/10/10/10 nor the 11/11/10 packed formats are supported
+    * by the hardware.
+    */
+   case MESA_FORMAT_R10G10B10A2_UINT:
+   case MESA_FORMAT_R10G10B10A2_UNORM:
+   case MESA_FORMAT_R11G11B10_FLOAT:
+      return MESA_FORMAT_R_UINT32;
+
+   /* No normalized fixed-point formats are supported by the hardware. */
+   case MESA_FORMAT_RGBA_UNORM16:
+   case MESA_FORMAT_RGBA_SNORM16:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT16 : MESA_FORMAT_RG_UINT32);
+
+   case MESA_FORMAT_R8G8B8A8_UNORM:
+   case MESA_FORMAT_R8G8B8A8_SNORM:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RGBA_UINT8 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_R16G16_UNORM:
+   case MESA_FORMAT_R16G16_SNORM:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT16 : MESA_FORMAT_R_UINT32);
+
+   case MESA_FORMAT_R8G8_UNORM:
+   case MESA_FORMAT_R8G8_SNORM:
+      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+              MESA_FORMAT_RG_UINT8 : MESA_FORMAT_R_UINT16);
+
+   case MESA_FORMAT_R_UNORM16:
+   case MESA_FORMAT_R_SNORM16:
+      return MESA_FORMAT_R_UINT16;
+
+   case MESA_FORMAT_R_UNORM8:
+   case MESA_FORMAT_R_SNORM8:
+      return MESA_FORMAT_R_UINT8;
+
+   default:
+      unreachable("Unknown image format");
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 998d8c42770..b8b03932065 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -63,7 +63,7 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    int i = 0;
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp));
 
    switch(mt->target) {
    case GL_TEXTURE_1D:
@@ -95,7 +95,7 @@ tr_mode_horizontal_texture_alignment(const struct brw_context *brw,
    ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
                align_yf[i] : align_ys[i];
 
-   assert(is_power_of_two(mt->num_samples));
+   assert(_mesa_is_pow_two(mt->num_samples));
 
    switch (mt->num_samples) {
    case 2:
@@ -199,7 +199,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
           mt->target != GL_TEXTURE_1D_ARRAY);
 
    /* Alignment computations below assume bpp >= 8 and a power of 2. */
-   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp)) ;
+   assert (bpp >= 8 && bpp <= 128 && _mesa_is_pow_two(bpp)) ;
 
    switch(mt->target) {
    case GL_TEXTURE_2D:
@@ -226,7 +226,7 @@ tr_mode_vertical_texture_alignment(const struct brw_context *brw,
    ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
                align_yf[i] : align_ys[i];
 
-   assert(is_power_of_two(mt->num_samples));
+   assert(_mesa_is_pow_two(mt->num_samples));
 
    switch (mt->num_samples) {
    case 4:
@@ -366,9 +366,8 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
 
    mt->total_width = mt->physical_width0;
 
-   if (mt->compressed) {
-       mt->total_width = ALIGN(mt->physical_width0, mt->align_w);
-   }
+   if (mt->compressed)
+       mt->total_width = ALIGN(mt->total_width, bw);
 
    /* May need to adjust width to accommodate the placement of
     * the 2nd mipmap.  This occurs when the alignment
@@ -433,9 +432,7 @@ brw_miptree_get_horizontal_slice_pitch(const struct brw_context *brw,
                                        const struct intel_mipmap_tree *mt,
                                        unsigned level)
 {
-   assert(brw->gen < 9);
-
-   if (mt->target == GL_TEXTURE_3D ||
+   if ((brw->gen < 9 && mt->target == GL_TEXTURE_3D) ||
        (brw->gen == 4 && mt->target == GL_TEXTURE_CUBE_MAP)) {
       return ALIGN(minify(mt->physical_width0, level), mt->align_w);
    } else {
@@ -615,8 +612,8 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
  */
 static uint32_t
 brw_miptree_choose_tiling(struct brw_context *brw,
-                          enum intel_miptree_tiling_mode requested,
-                          const struct intel_mipmap_tree *mt)
+                          const struct intel_mipmap_tree *mt,
+                          uint32_t layout_flags)
 {
    if (mt->format == MESA_FORMAT_S_UINT8) {
       /* The stencil buffer is W tiled. However, we request from the kernel a
@@ -625,15 +622,18 @@ brw_miptree_choose_tiling(struct brw_context *brw,
       return I915_TILING_NONE;
    }
 
+   /* Do not support changing the tiling for miptrees with pre-allocated BOs. */
+   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
+
    /* Some usages may want only one type of tiling, like depth miptrees (Y
     * tiled), or temporary BOs for uploading data once (linear).
     */
-   switch (requested) {
-   case INTEL_MIPTREE_TILING_ANY:
+   switch (layout_flags & MIPTREE_LAYOUT_TILING_ANY) {
+   case MIPTREE_LAYOUT_TILING_ANY:
       break;
-   case INTEL_MIPTREE_TILING_Y:
+   case MIPTREE_LAYOUT_TILING_Y:
       return I915_TILING_Y;
-   case INTEL_MIPTREE_TILING_NONE:
+   case MIPTREE_LAYOUT_TILING_NONE:
       return I915_TILING_NONE;
    }
 
@@ -762,16 +762,13 @@ intel_miptree_set_total_width_height(struct brw_context *brw,
        mt->total_width, mt->total_height, mt->cpp);
 }
 
-void
-brw_miptree_layout(struct brw_context *brw,
-                   struct intel_mipmap_tree *mt,
-                   enum intel_miptree_tiling_mode requested,
-                   uint32_t layout_flags)
+static void
+intel_miptree_set_alignment(struct brw_context *brw,
+                            struct intel_mipmap_tree *mt,
+                            uint32_t layout_flags)
 {
    bool gen6_hiz_or_stencil = false;
 
-   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
-
    if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
       const GLenum base_format = _mesa_get_format_base_format(mt->format);
       gen6_hiz_or_stencil = _mesa_is_depth_or_stencil_format(base_format);
@@ -806,7 +803,16 @@ brw_miptree_layout(struct brw_context *brw,
          intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);
       mt->align_h = intel_vertical_texture_alignment_unit(brw, mt);
    }
+}
 
+void
+brw_miptree_layout(struct brw_context *brw,
+                   struct intel_mipmap_tree *mt,
+                   uint32_t layout_flags)
+{
+   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
+
+   intel_miptree_set_alignment(brw, mt, layout_flags);
    intel_miptree_set_total_width_height(brw, mt);
 
    if (!mt->total_width || !mt->total_height) {
@@ -825,6 +831,6 @@ brw_miptree_layout(struct brw_context *brw,
    }
 
    if ((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0)
-      mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
+      mt->tiling = brw_miptree_choose_tiling(brw, mt, layout_flags);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 6fcf1b0cb1d..6078c3810d4 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -249,10 +249,10 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence  = brw->urb.size;
 
    /* erratum: URB_FENCE must not cross a 64byte cacheline */
-   if ((brw->batch.used & 15) > 12) {
-      int pad = 16 - (brw->batch.used & 15);
+   if ((USED_BATCH(brw->batch) & 15) > 12) {
+      int pad = 16 - (USED_BATCH(brw->batch) & 15);
       do
-	 brw->batch.map[brw->batch.used++] = MI_NOOP;
+         *brw->batch.map_next++ = MI_NOOP;
       while (--pad);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index 04e4e944118..68f4318d371 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -53,14 +53,14 @@ brw_get_line_width(struct brw_context *brw)
    float line_width =
       CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag
             ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width,
-            0.0, brw->ctx.Const.MaxLineWidth);
+            0.0f, brw->ctx.Const.MaxLineWidth);
    uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
 
    /* Line width of 0 is not allowed when MSAA enabled */
    if (brw->ctx.Multisample._Enabled) {
       if (line_width_u3_7 == 0)
          line_width_u3_7 = 1;
-   } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5) {
+   } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5f) {
       /* For 1 pixel line thickness or less, the general
        * anti-aliasing algorithm gives up, and a garbage line is
        * generated.  Setting a Line Width of 0.0 specifies the
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 05f188fe116..63f75da7e99 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -171,6 +171,17 @@ dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
    this->writemask = writemask;
 }
 
+dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
+                 unsigned writemask)
+{
+   init();
+   /* Copy each constructor argument into the member of the same name. */
+   this->file = file;
+   this->reg = reg;
+   this->type = type;
+   this->writemask = writemask;
+}
+
 dst_reg::dst_reg(struct brw_reg reg)
 {
    init();
@@ -1709,6 +1720,9 @@ vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
 bool
 vec4_visitor::run(gl_clip_plane *clip_planes)
 {
+   bool use_vec4_nir =
+      compiler->glsl_compiler_options[stage].NirOptions != NULL;
+
    sanity_param_count = prog->Parameters->NumParameters;
 
    if (shader_time_index >= 0)
@@ -1718,10 +1732,15 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
 
    emit_prolog();
 
-   /* Generate VS IR for main().  (the visitor only descends into
-    * functions called "main").
-    */
-   if (shader) {
+   if (use_vec4_nir) {
+      assert(prog->nir != NULL);
+      emit_nir_code();
+      if (failed)
+         return false;
+   } else if (shader) {
+      /* Generate VS IR for main().  (the visitor only descends into
+       * functions called "main").
+       */
       visit_instructions(shader->base.ir);
    } else {
       emit_program_code();
@@ -1741,7 +1760,7 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
     * that we have reladdr computations available for CSE, since we'll
     * often do repeated subexpressions for those.
     */
-   if (shader) {
+   if (shader || use_vec4_nir) {
       move_grf_array_access_to_scratch();
       move_uniform_array_access_to_pull_constants();
    } else {
@@ -1827,15 +1846,30 @@ vec4_visitor::run(gl_clip_plane *clip_planes)
       }
    }
 
-   while (!reg_allocate()) {
-      if (failed)
-         return false;
+   bool allocated_without_spills = reg_allocate();
+
+   if (!allocated_without_spills) {
+      compiler->shader_perf_log(log_data,
+                                "%s shader triggered register spilling.  "
+                                "Try reducing the number of live vec4 values "
+                                "to improve performance.\n",
+                                stage_name);
+
+      while (!reg_allocate()) {
+         if (failed)
+            return false;
+      }
    }
 
    opt_schedule_instructions();
 
    opt_set_dependency_control();
 
+   if (last_scratch > 0) {
+      prog_data->base.total_scratch =
+         brw_get_scratch_size(last_scratch * REG_SIZE);
+   }
+
    /* If any state parameters were appended, then ParameterValues could have
     * been realloced, in which case the driver uniform storage set up by
     * _mesa_associate_uniform_storage() would point to freed memory.  Make
@@ -1857,10 +1891,11 @@ extern "C" {
  */
 const unsigned *
 brw_vs_emit(struct brw_context *brw,
-            struct gl_shader_program *prog,
-            struct brw_vs_compile *c,
-            struct brw_vs_prog_data *prog_data,
             void *mem_ctx,
+            const struct brw_vs_prog_key *key,
+            struct brw_vs_prog_data *prog_data,
+            struct gl_vertex_program *vp,
+            struct gl_shader_program *prog,
             unsigned *final_assembly_size)
 {
    bool start_busy = false;
@@ -1879,29 +1914,31 @@ brw_vs_emit(struct brw_context *brw,
 
    int st_index = -1;
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
-      st_index = brw_get_shader_time_index(brw, prog, &c->vp->program.Base,
-                                           ST_VS);
+      st_index = brw_get_shader_time_index(brw, prog, &vp->Base, ST_VS);
 
    if (unlikely(INTEL_DEBUG & DEBUG_VS) && shader->base.ir)
-      brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
+      brw_dump_ir("vertex", prog, &shader->base, &vp->Base);
+
+   if (!vp->Base.nir &&
+       (brw->intelScreen->compiler->scalar_vs ||
+        brw->intelScreen->compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions != NULL)) {
+      /* Normally we generate NIR in LinkShader() or
+       * ProgramStringNotify(), but Mesa's fixed-function vertex program
+       * handling doesn't notify the driver at all.  Just do it here, at
+       * the last minute, even though it's lame.
+       */
+      assert(vp->Base.Id == 0 && prog == NULL);
+      vp->Base.nir =
+         brw_create_nir(brw, NULL, &vp->Base, MESA_SHADER_VERTEX,
+                        brw->intelScreen->compiler->scalar_vs);
+   }
 
    if (brw->intelScreen->compiler->scalar_vs) {
-      if (!c->vp->program.Base.nir) {
-         /* Normally we generate NIR in LinkShader() or
-          * ProgramStringNotify(), but Mesa's fixed-function vertex program
-          * handling doesn't notify the driver at all.  Just do it here, at
-          * the last minute, even though it's lame.
-          */
-         assert(c->vp->program.Base.Id == 0 && prog == NULL);
-         c->vp->program.Base.nir =
-            brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
-      }
-
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
       fs_visitor v(brw->intelScreen->compiler, brw,
-                   mem_ctx, MESA_SHADER_VERTEX, &c->key,
-                   &prog_data->base.base, prog, &c->vp->program.Base,
+                   mem_ctx, MESA_SHADER_VERTEX, key,
+                   &prog_data->base.base, prog, &vp->Base,
                    8, st_index);
       if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
@@ -1916,8 +1953,8 @@ brw_vs_emit(struct brw_context *brw,
       }
 
       fs_generator g(brw->intelScreen->compiler, brw,
-                     mem_ctx, (void *) &c->key, &prog_data->base.base,
-                     &c->vp->program.Base, v.promoted_constants,
+                     mem_ctx, (void *) key, &prog_data->base.base,
+                     &vp->Base, v.promoted_constants,
                      v.runtime_check_aads_emit, "VS");
       if (INTEL_DEBUG & DEBUG_VS) {
          char *name;
@@ -1927,21 +1964,19 @@ brw_vs_emit(struct brw_context *brw,
                                    prog->Name);
          } else {
             name = ralloc_asprintf(mem_ctx, "vertex program %d",
-                                   c->vp->program.Base.Id);
+                                   vp->Base.Id);
          }
          g.enable_debug(name);
       }
       g.generate_code(v.cfg, 8);
       assembly = g.get_assembly(final_assembly_size);
-
-      c->base.last_scratch = v.last_scratch;
    }
 
    if (!assembly) {
       prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-      vec4_vs_visitor v(brw->intelScreen->compiler,
-                        c, prog_data, prog, mem_ctx, st_index,
+      vec4_vs_visitor v(brw->intelScreen->compiler, brw, key, prog_data,
+                        vp, prog, mem_ctx, st_index,
                         !_mesa_is_gles3(&brw->ctx));
       if (!v.run(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
@@ -1956,14 +1991,14 @@ brw_vs_emit(struct brw_context *brw,
       }
 
       vec4_generator g(brw->intelScreen->compiler, brw,
-                       prog, &c->vp->program.Base, &prog_data->base,
+                       prog, &vp->Base, &prog_data->base,
                        mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
       assembly = g.generate_assembly(v.cfg, final_assembly_size);
    }
 
    if (unlikely(brw->perf_debug) && shader) {
       if (shader->compiled_once) {
-         brw_vs_debug_recompile(brw, prog, &c->key);
+         brw_vs_debug_recompile(brw, prog, key);
       }
       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
          perf_debug("VS compile took %.03f ms and stalled the GPU\n",
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 2ac16932189..341c516b39a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -45,12 +45,9 @@ extern "C" {
 #endif
 
 #include "glsl/ir.h"
+#include "glsl/nir/nir.h"
 
 
-struct brw_vec4_compile {
-   GLuint last_scratch; /**< measured in 32-byte (register size) units */
-};
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -77,7 +74,7 @@ class vec4_visitor : public backend_shader, public ir_visitor
 {
 public:
    vec4_visitor(const struct brw_compiler *compiler,
-                struct brw_vec4_compile *c,
+                void *log_data,
                 struct gl_program *prog,
                 const struct brw_vue_prog_key *key,
                 struct brw_vue_prog_data *prog_data,
@@ -103,7 +100,6 @@ public:
       return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
    }
 
-   struct brw_vec4_compile * const c;
    const struct brw_vue_prog_key * const key;
    struct brw_vue_prog_data * const prog_data;
    unsigned int sanity_param_count;
@@ -181,9 +177,12 @@ public:
    void fail(const char *msg, ...);
 
    void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
+   virtual void setup_vector_uniform_values(const gl_constant_value *values,
+                                            unsigned n);
    void setup_uniform_values(ir_variable *ir);
    void setup_builtin_uniform_values(ir_variable *ir);
    int setup_uniforms(int payload_reg);
+
    bool reg_allocate_trivial();
    bool reg_allocate();
    void evaluate_spill_costs(float *spill_costs, bool *no_spill);
@@ -292,14 +291,17 @@ public:
    void emit_bool_to_cond_code(ir_rvalue *ir, enum brw_predicate *predicate);
    void emit_if_gen6(ir_if *ir);
 
-   void emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
-                    src_reg src0, src_reg src1);
+   vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
+                                 src_reg src0, src_reg src1);
 
-   void emit_lrp(const dst_reg &dst,
-                 const src_reg &x, const src_reg &y, const src_reg &a);
+   vec4_instruction *emit_lrp(const dst_reg &dst, const src_reg &x,
+                              const src_reg &y, const src_reg &a);
 
-   /** Copy any live channel from \p src to the first channel of \p dst. */
-   void emit_uniformize(const dst_reg &dst, const src_reg &src);
+   /**
+    * Copy any live channel from \p src to the first channel of the
+    * result.
+    */
+   src_reg emit_uniformize(const src_reg &src);
 
    void emit_block_move(dst_reg *dst, src_reg *src,
                         const struct glsl_type *type, brw_predicate predicate);
@@ -317,11 +319,13 @@ public:
    void emit_scalar(ir_instruction *ir, enum prog_opcode op,
 		    dst_reg dst, src_reg src0, src_reg src1);
 
-   src_reg fix_3src_operand(src_reg src);
+   src_reg fix_3src_operand(const src_reg &src);
+   src_reg resolve_source_modifiers(const src_reg &src);
 
-   void emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
-                  const src_reg &src1 = src_reg());
-   src_reg fix_math_operand(src_reg src);
+   vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+                               const src_reg &src1 = src_reg());
+
+   src_reg fix_math_operand(const src_reg &src);
 
    void emit_pack_half_2x16(dst_reg dst, src_reg src0);
    void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
@@ -330,10 +334,27 @@ public:
    void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
    void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
 
-   uint32_t gather_channel(ir_texture *ir, uint32_t sampler);
-   src_reg emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler);
+   void emit_texture(ir_texture_opcode op,
+                     dst_reg dest,
+                     const glsl_type *dest_type,
+                     src_reg coordinate,
+                     int coord_components,
+                     src_reg shadow_comparitor,
+                     src_reg lod, src_reg lod2,
+                     src_reg sample_index,
+                     uint32_t constant_offset,
+                     src_reg offset_value,
+                     src_reg mcs,
+                     bool is_cube_array,
+                     uint32_t sampler, src_reg sampler_reg);
+
+   uint32_t gather_channel(unsigned gather_component, uint32_t sampler);
+   src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
+                          src_reg sampler);
    void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
-   void swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler);
+   void swizzle_result(ir_texture_opcode op, dst_reg dest,
+                       src_reg orig_val, uint32_t sampler,
+                       const glsl_type *dest_type);
 
    void emit_ndc_computation();
    void emit_psiz_and_flags(dst_reg reg);
@@ -388,13 +409,53 @@ public:
 
    void visit_atomic_counter_intrinsic(ir_call *ir);
 
+   int type_size(const struct glsl_type *type);
+   bool is_high_sampler(src_reg sampler);
+
+   virtual void emit_nir_code();
+   virtual void nir_setup_inputs(nir_shader *shader);
+   virtual void nir_setup_uniforms(nir_shader *shader);
+   virtual void nir_setup_uniform(nir_variable *var);
+   virtual void nir_setup_builtin_uniform(nir_variable *var);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void nir_setup_system_values(nir_shader *shader);
+   virtual void nir_emit_impl(nir_function_impl *impl);
+   virtual void nir_emit_cf_list(exec_list *list);
+   virtual void nir_emit_if(nir_if *if_stmt);
+   virtual void nir_emit_loop(nir_loop *loop);
+   virtual void nir_emit_block(nir_block *block);
+   virtual void nir_emit_instr(nir_instr *instr);
+   virtual void nir_emit_load_const(nir_load_const_instr *instr);
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+   virtual void nir_emit_alu(nir_alu_instr *instr);
+   virtual void nir_emit_jump(nir_jump_instr *instr);
+   virtual void nir_emit_texture(nir_tex_instr *instr);
+
+   dst_reg get_nir_dest(nir_dest dest, enum brw_reg_type type);
+   dst_reg get_nir_dest(nir_dest dest, nir_alu_type type);
+   dst_reg get_nir_dest(nir_dest dest);
+   src_reg get_nir_src(nir_src src, enum brw_reg_type type,
+                       unsigned num_components = 4);
+   src_reg get_nir_src(nir_src src, nir_alu_type type,
+                       unsigned num_components = 4);
+   src_reg get_nir_src(nir_src src,
+                       unsigned num_components = 4);
+
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type) = 0;
+
+   dst_reg *nir_locals;
+   dst_reg *nir_ssa_values;
+   src_reg *nir_inputs;
+   unsigned *nir_uniform_driver_location;
+   dst_reg *nir_system_values;
+
 protected:
    void emit_vertex();
    void lower_attributes_to_hw_regs(const int *attribute_map,
                                     bool interleaved);
    void setup_payload_interference(struct ra_graph *g, int first_payload_node,
                                    int reg_node_count);
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir) = 0;
    virtual void assign_binding_table_offsets();
    virtual void setup_payload() = 0;
    virtual void emit_prolog() = 0;
@@ -403,6 +464,8 @@ protected:
    virtual void emit_urb_write_header(int mrf) = 0;
    virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
    virtual int compute_array_stride(ir_dereference_array *ir);
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
 
 private:
    /**
@@ -411,6 +474,8 @@ private:
    const bool no_spills;
 
    int shader_time_index;
+
+   unsigned last_scratch; /**< measured in 32-byte (register size) units */
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
index 2d9afa8145f..5a15eb89766 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
@@ -179,6 +179,7 @@ try_constant_propagate(const struct brw_device_info *devinfo,
 
    case BRW_OPCODE_MACH:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_OR:
    case BRW_OPCODE_AND:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index c9fe0cebf27..5a277f74c44 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -62,6 +62,7 @@ is_expression(const vec4_instruction *const inst)
    case BRW_OPCODE_CMPN:
    case BRW_OPCODE_ADD:
    case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
    case BRW_OPCODE_FRC:
    case BRW_OPCODE_RNDU:
    case BRW_OPCODE_RNDD:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index d2de2f0be25..92050b94d33 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1465,19 +1465,15 @@ vec4_generator::generate_code(const cfg_t *cfg)
          break;
 
       case SHADER_OPCODE_UNTYPED_ATOMIC:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen,
                             !inst->dst.is_null());
-         brw_mark_surface_used(&prog_data->base, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
-         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
-                src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                   src[2].dw1.ud);
-         brw_mark_surface_used(&prog_data->base, src[1].dw1.ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -1549,7 +1545,7 @@ vec4_generator::generate_code(const cfg_t *cfg)
           *
           * where they pack the four bytes from the low and high four DW.
           */
-         assert(is_power_of_two(dst.dw1.bits.writemask) &&
+         assert(_mesa_is_pow_two(dst.dw1.bits.writemask) &&
                 dst.dw1.bits.writemask != 0);
          unsigned offset = __builtin_ctz(dst.dw1.bits.writemask);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
new file mode 100644
index 00000000000..d85fb6f31ec
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4_gs_visitor.h"
+
+namespace brw {
+
+/*
+ * Populate nir_inputs with one ATTR source register per type_size() entry
+ * of every shader input.  Each variable's registers are stored
+ * consecutively, beginning at the index given by its driver_location.
+ */
+void
+vec4_gs_visitor::nir_setup_inputs(nir_shader *shader)
+{
+   nir_inputs = ralloc_array(mem_ctx, src_reg, shader->num_inputs);
+
+   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
+      const bool is_array = var->type->base_type == GLSL_TYPE_ARRAY;
+      int slot = var->data.driver_location;
+
+      /* Geometry shader inputs are arrays, but they use an unusual array
+       * layout: instead of all array elements for a given geometry shader
+       * input being stored consecutively, all geometry shader inputs are
+       * interleaved into one giant array. At this stage of compilation, we
+       * assume that the stride of the array is BRW_VARYING_SLOT_COUNT.
+       * Later, setup_attributes() will remap our accesses to the actual
+       * input array.
+       *
+       * A non-array input is walked as a single element, so the stride
+       * is never applied to it.
+       */
+      assert(!is_array || var->type->length > 0);
+      const int num_elements = is_array ? var->type->length : 1;
+      const int slots_per_element = type_size(var->type) / num_elements;
+
+      for (int elem = 0; elem < num_elements; elem++) {
+         const int base = var->data.location + elem * BRW_VARYING_SLOT_COUNT;
+         for (int s = 0; s < slots_per_element; s++) {
+            src_reg attr = src_reg(ATTR, base + s, var->type);
+            attr = retype(attr, brw_type_for_base_type(var->type));
+            nir_inputs[slot++] = attr;
+         }
+      }
+   }
+}
+
+/*
+ * For a load_invocation_id intrinsic, make sure the matching entry of
+ * nir_system_values holds a register; other intrinsics go to vec4_visitor.
+ */
+void
+vec4_gs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   if (instr->intrinsic != nir_intrinsic_load_invocation_id) {
+      vec4_visitor::nir_setup_system_value_intrinsic(instr);
+      return;
+   }
+
+   /* Leave the entry alone if it already holds a register (not BAD_FILE). */
+   dst_reg &invocation_id = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+   if (invocation_id.file == BAD_FILE)
+      invocation_id = *make_reg_for_system_value(SYSTEM_VALUE_INVOCATION_ID,
+                                                 glsl_type::int_type);
+}
+
+/*
+ * Handle the intrinsics specific to this visitor (emit_vertex, end_primitive,
+ * load_invocation_id) and defer everything else to vec4_visitor.
+ */
+void
+vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_emit_vertex:
+      gs_emit_vertex(instr->const_index[0]);
+      break;
+
+   case nir_intrinsic_end_primitive:
+      gs_end_primitive();
+      break;
+
+   case nir_intrinsic_load_invocation_id: {
+      src_reg invocation_id =
+         src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
+      assert(invocation_id.file != BAD_FILE);
+      dst_reg dest = get_nir_dest(instr->dest, invocation_id.type);
+      emit(MOV(dest, invocation_id));
+      break;
+   }
+
+   default:
+      vec4_visitor::nir_emit_intrinsic(instr);
+      break;
+   }
+}
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 69bcf5afc51..019efecac66 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -35,12 +35,14 @@ const unsigned MAX_GS_INPUT_VERTICES = 6;
 namespace brw {
 
 vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
+                                 void *log_data,
                                  struct brw_gs_compile *c,
                                  struct gl_shader_program *prog,
                                  void *mem_ctx,
                                  bool no_spills,
                                  int shader_time_index)
-   : vec4_visitor(compiler, &c->base, &c->gp->program.Base, &c->key.base,
+   : vec4_visitor(compiler, log_data,
+                  &c->gp->program.Base, &c->key.base,
                   &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
                   no_spills, shader_time_index),
      c(c)
@@ -49,11 +51,12 @@ vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
 
 
 dst_reg *
-vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir)
+vec4_gs_visitor::make_reg_for_system_value(int location,
+                                           const glsl_type *type)
 {
-   dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type);
+   dst_reg *reg = new(mem_ctx) dst_reg(this, type);
 
-   switch (ir->data.location) {
+   switch (location) {
    case SYSTEM_VALUE_INVOCATION_ID:
       this->current_annotation = "initialize gl_InvocationID";
       emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
@@ -346,90 +349,82 @@ vec4_gs_visitor::emit_control_data_bits()
    if (c->control_data_header_size_bits > 128)
       urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
 
-   /* If vertex_count is 0, then no control data bits have been accumulated
-    * yet, so we should do nothing.
+   /* If we are using either channel masks or a per-slot offset, then we
+    * need to figure out which DWORD we are trying to write to, using the
+    * formula:
+    *
+    *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
+    *
+    * Since bits_per_vertex is a power of two, and is known at compile
+    * time, this can be optimized to:
+    *
+    *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
     */
-   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
-   emit(IF(BRW_PREDICATE_NORMAL));
-   {
-      /* If we are using either channel masks or a per-slot offset, then we
-       * need to figure out which DWORD we are trying to write to, using the
-       * formula:
-       *
-       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
-       *
-       * Since bits_per_vertex is a power of two, and is known at compile
-       * time, this can be optimized to:
-       *
-       *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
-       */
-      src_reg dword_index(this, glsl_type::uint_type);
-      if (urb_write_flags) {
-         src_reg prev_count(this, glsl_type::uint_type);
-         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
-         unsigned log2_bits_per_vertex =
-            _mesa_fls(c->control_data_bits_per_vertex);
-         emit(SHR(dst_reg(dword_index), prev_count,
-                  (uint32_t) (6 - log2_bits_per_vertex)));
-      }
-
-      /* Start building the URB write message.  The first MRF gets a copy of
-       * R0.
-       */
-      int base_mrf = 1;
-      dst_reg mrf_reg(MRF, base_mrf);
-      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
-      inst->force_writemask_all = true;
-
-      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
-         /* Set the per-slot offset to dword_index / 4, to that we'll write to
-          * the appropriate OWORD within the control data header.
-          */
-         src_reg per_slot_offset(this, glsl_type::uint_type);
-         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
-         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
-      }
-
-      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
-         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
-          * write to the appropriate DWORD within the OWORD.  We need to do
-          * this computation with force_writemask_all, otherwise garbage data
-          * from invocation 0 might clobber the mask for invocation 1 when
-          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
-          * together.
-          */
-         src_reg channel(this, glsl_type::uint_type);
-         inst = emit(AND(dst_reg(channel), dword_index, 3u));
-         inst->force_writemask_all = true;
-         src_reg one(this, glsl_type::uint_type);
-         inst = emit(MOV(dst_reg(one), 1u));
-         inst->force_writemask_all = true;
-         src_reg channel_mask(this, glsl_type::uint_type);
-         inst = emit(SHL(dst_reg(channel_mask), one, channel));
-         inst->force_writemask_all = true;
-         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
-                                               channel_mask);
-         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
-      }
-
-      /* Store the control data bits in the message payload and send it. */
-      dst_reg mrf_reg2(MRF, base_mrf + 1);
-      inst = emit(MOV(mrf_reg2, this->control_data_bits));
-      inst->force_writemask_all = true;
-      inst = emit(GS_OPCODE_URB_WRITE);
-      inst->urb_write_flags = urb_write_flags;
-      /* We need to increment Global Offset by 256-bits to make room for
-       * Broadwell's extra "Vertex Count" payload at the beginning of the
-       * URB entry.  Since this is an OWord message, Global Offset is counted
-       * in 128-bit units, so we must set it to 2.
-       */
-      if (devinfo->gen >= 8)
-         inst->offset = 2;
-      inst->base_mrf = base_mrf;
-      inst->mlen = 2;
+   src_reg dword_index(this, glsl_type::uint_type);
+   if (urb_write_flags) {
+      src_reg prev_count(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
+      unsigned log2_bits_per_vertex =
+         _mesa_fls(c->control_data_bits_per_vertex);
+      emit(SHR(dst_reg(dword_index), prev_count,
+               (uint32_t) (6 - log2_bits_per_vertex)));
    }
-   emit(BRW_OPCODE_ENDIF);
+
+   /* Start building the URB write message.  The first MRF gets a copy of
+    * R0.
+    */
+   int base_mrf = 1;
+   dst_reg mrf_reg(MRF, base_mrf);
+   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+   inst->force_writemask_all = true;
+
+   if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
+      /* Set the per-slot offset to dword_index / 4, so that we'll write to
+       * the appropriate OWORD within the control data header.
+       */
+      src_reg per_slot_offset(this, glsl_type::uint_type);
+      emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
+      emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
+   }
+
+   if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
+      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+       * write to the appropriate DWORD within the OWORD.  We need to do
+       * this computation with force_writemask_all, otherwise garbage data
+       * from invocation 0 might clobber the mask for invocation 1 when
+       * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
+       * together.
+       */
+      src_reg channel(this, glsl_type::uint_type);
+      inst = emit(AND(dst_reg(channel), dword_index, 3u));
+      inst->force_writemask_all = true;
+      src_reg one(this, glsl_type::uint_type);
+      inst = emit(MOV(dst_reg(one), 1u));
+      inst->force_writemask_all = true;
+      src_reg channel_mask(this, glsl_type::uint_type);
+      inst = emit(SHL(dst_reg(channel_mask), one, channel));
+      inst->force_writemask_all = true;
+      emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
+                                            channel_mask);
+      emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
+   }
+
+   /* Store the control data bits in the message payload and send it. */
+   dst_reg mrf_reg2(MRF, base_mrf + 1);
+   inst = emit(MOV(mrf_reg2, this->control_data_bits));
+   inst->force_writemask_all = true;
+   inst = emit(GS_OPCODE_URB_WRITE);
+   inst->urb_write_flags = urb_write_flags;
+   /* We need to increment Global Offset by 256-bits to make room for
+    * Broadwell's extra "Vertex Count" payload at the beginning of the
+    * URB entry.  Since this is an OWord message, Global Offset is counted
+    * in 128-bit units, so we must set it to 2.
+    */
+   if (devinfo->gen >= 8)
+      inst->offset = 2;
+   inst->base_mrf = base_mrf;
+   inst->mlen = 2;
 }
 
 void
@@ -472,7 +467,7 @@ vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
 }
 
 void
-vec4_gs_visitor::visit(ir_emit_vertex *ir)
+vec4_gs_visitor::gs_emit_vertex(int stream_id)
 {
    this->current_annotation = "emit vertex: safety check";
 
@@ -486,7 +481,7 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
     * be recorded by transform feedback, we can simply discard all geometry
     * bound to these streams when transform feedback is disabled.
     */
-   if (ir->stream_id() > 0 && shader_prog->TransformFeedback.NumVarying == 0)
+   if (stream_id > 0 && shader_prog->TransformFeedback.NumVarying == 0)
       return;
 
    /* To ensure that we don't output more vertices than the shader specified
@@ -529,9 +524,17 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
             emit(AND(dst_null_d(), this->vertex_count,
                      (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
          inst->conditional_mod = BRW_CONDITIONAL_Z;
+
          emit(IF(BRW_PREDICATE_NORMAL));
          {
+            /* If vertex_count is 0, then no control data bits have been
+             * accumulated yet, so we skip emitting them.
+             */
+            emit(CMP(dst_null_d(), this->vertex_count, 0u,
+                     BRW_CONDITIONAL_NEQ));
+            emit(IF(BRW_PREDICATE_NORMAL));
             emit_control_data_bits();
+            emit(BRW_OPCODE_ENDIF);
 
             /* Reset control_data_bits to 0 so we can start accumulating a new
              * batch.
@@ -557,7 +560,7 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
           c->prog_data.control_data_format ==
              GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
           this->current_annotation = "emit vertex: Stream control data bits";
-          set_stream_control_data_bits(ir->stream_id());
+          set_stream_control_data_bits(stream_id);
       }
 
       this->current_annotation = "emit vertex: increment vertex count";
@@ -570,7 +573,13 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir)
 }
 
 void
-vec4_gs_visitor::visit(ir_end_primitive *)
+vec4_gs_visitor::visit(ir_emit_vertex *ir)
+{
+   gs_emit_vertex(ir->stream_id());
+}
+
+void
+vec4_gs_visitor::gs_end_primitive()
 {
    /* We can only do EndPrimitive() functionality when the control data
     * consists of cut bits.  Fortunately, the only time it isn't is when the
@@ -620,6 +629,12 @@ vec4_gs_visitor::visit(ir_end_primitive *)
    emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
 }
 
+void
+vec4_gs_visitor::visit(ir_end_primitive *)
+{
+   gs_end_primitive();
+}
+
 static const unsigned *
 generate_assembly(struct brw_context *brw,
                   struct gl_shader_program *shader_prog,
@@ -662,7 +677,7 @@ brw_gs_emit(struct brw_context *brw,
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
          c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(brw->intelScreen->compiler,
+         vec4_gs_visitor v(brw->intelScreen->compiler, brw,
                            c, prog, mem_ctx, true /* no_spills */, st_index);
          if (v.run(NULL /* clip planes */)) {
             return generate_assembly(brw, prog, &c->gp->program.Base,
@@ -704,11 +719,11 @@ brw_gs_emit(struct brw_context *brw,
    const unsigned *ret = NULL;
 
    if (brw->gen >= 7)
-      gs = new vec4_gs_visitor(brw->intelScreen->compiler,
+      gs = new vec4_gs_visitor(brw->intelScreen->compiler, brw,
                                c, prog, mem_ctx, false /* no_spills */,
                                st_index);
    else
-      gs = new gen6_gs_visitor(brw->intelScreen->compiler,
+      gs = new gen6_gs_visitor(brw->intelScreen->compiler, brw,
                                c, prog, mem_ctx, false /* no_spills */,
                                st_index);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index e693c56b58f..0e8fefabecc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -37,7 +37,6 @@
  */
 struct brw_gs_compile
 {
-   struct brw_vec4_compile base;
    struct brw_gs_prog_key key;
    struct brw_gs_prog_data prog_data;
    struct brw_vue_map input_vue_map;
@@ -69,14 +68,19 @@ class vec4_gs_visitor : public vec4_visitor
 {
 public:
    vec4_gs_visitor(const struct brw_compiler *compiler,
+                   void *log_data,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
                    bool no_spills,
                    int shader_time_index);
 
+   virtual void nir_setup_inputs(nir_shader *shader);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void setup_payload();
    virtual void emit_prolog();
    virtual void emit_program_code();
@@ -86,6 +90,9 @@ protected:
    virtual int compute_array_stride(ir_dereference_array *ir);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
 
 protected:
    int setup_varying_inputs(int payload_reg, int *attribute_map,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
index 95b9d9017e2..cc688ef8083 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
@@ -96,7 +96,8 @@ vec4_live_variables::setup_def_use()
 	  * are the things that screen off preceding definitions of a
 	  * variable, and thus qualify for being in def[].
 	  */
-	 if (inst->dst.file == GRF && !inst->predicate) {
+	 if (inst->dst.file == GRF &&
+	     (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) {
             for (unsigned i = 0; i < inst->regs_written; i++) {
                for (int c = 0; c < 4; c++) {
                   if (inst->dst.writemask & (1 << c)) {
@@ -133,27 +134,9 @@ vec4_live_variables::compute_live_variables()
    while (cont) {
       cont = false;
 
-      foreach_block (block, cfg) {
+      foreach_block_reverse (block, cfg) {
          struct block_data *bd = &block_data[block->num];
 
-	 /* Update livein */
-	 for (int i = 0; i < bitset_words; i++) {
-            BITSET_WORD new_livein = (bd->use[i] |
-                                      (bd->liveout[i] &
-                                       ~bd->def[i]));
-            if (new_livein & ~bd->livein[i]) {
-               bd->livein[i] |= new_livein;
-               cont = true;
-	    }
-	 }
-         BITSET_WORD new_livein = (bd->flag_use[0] |
-                                   (bd->flag_liveout[0] &
-                                    ~bd->flag_def[0]));
-         if (new_livein & ~bd->flag_livein[0]) {
-            bd->flag_livein[0] |= new_livein;
-            cont = true;
-         }
-
 	 /* Update liveout */
 	 foreach_list_typed(bblock_link, child_link, link, &block->children) {
             struct block_data *child_bd = &block_data[child_link->block->num];
@@ -173,6 +156,24 @@ vec4_live_variables::compute_live_variables()
                cont = true;
             }
 	 }
+
+         /* Update livein */
+         for (int i = 0; i < bitset_words; i++) {
+            BITSET_WORD new_livein = (bd->use[i] |
+                                      (bd->liveout[i] &
+                                       ~bd->def[i]));
+            if (new_livein & ~bd->livein[i]) {
+               bd->livein[i] |= new_livein;
+               cont = true;
+            }
+         }
+         BITSET_WORD new_livein = (bd->flag_use[0] |
+                                   (bd->flag_liveout[0] &
+                                    ~bd->flag_def[0]));
+         if (new_livein & ~bd->flag_livein[0]) {
+            bd->flag_livein[0] |= new_livein;
+            cont = true;
+         }
       }
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
new file mode 100644
index 00000000000..923e2d30a4c
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -0,0 +1,1548 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4.h"
+#include "glsl/ir_uniform.h"
+
+namespace brw {
+
+void
+vec4_visitor::emit_nir_code()
+{
+   nir_shader *nir = prog->nir;
+
+   if (nir->num_inputs > 0)
+      nir_setup_inputs(nir);
+
+   if (nir->num_uniforms > 0)
+      nir_setup_uniforms(nir);
+
+   nir_setup_system_values(nir);
+
+   /* get the main function and emit it */
+   nir_foreach_overload(nir, overload) {
+      assert(strcmp(overload->function->name, "main") == 0);
+      assert(overload->impl);
+      nir_emit_impl(overload->impl);
+   }
+}
+
+void
+vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   dst_reg *reg;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id().");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+      reg = &this->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
+      if (reg->file == BAD_FILE)
+         *reg =
+            *this->make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
+                                             glsl_type::int_type);
+      break;
+
+   case nir_intrinsic_load_base_vertex:
+      reg = &this->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
+      if (reg->file == BAD_FILE)
+         *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX,
+                                                 glsl_type::int_type);
+      break;
+
+   case nir_intrinsic_load_instance_id:
+      reg = &this->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
+      if (reg->file == BAD_FILE)
+         *reg = *this->make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID,
+                                                 glsl_type::int_type);
+      break;
+
+   default:
+      break;
+   }
+}
+
+static bool
+setup_system_values_block(nir_block *block, void *void_visitor)
+{
+   vec4_visitor *v = (vec4_visitor *)void_visitor;
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      v->nir_setup_system_value_intrinsic(intrin);
+   }
+
+   return true;
+}
+
+void
+vec4_visitor::nir_setup_system_values(nir_shader *shader)
+{
+   /* Zero-fill: slots are tested with file == BAD_FILE (assumed 0) later. */
+   nir_system_values = rzalloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
+   nir_foreach_overload(shader, overload) {
+      assert(strcmp(overload->function->name, "main") == 0);
+      assert(overload->impl);
+      nir_foreach_block(overload->impl, setup_system_values_block, this);
+   }
+}
+
+void
+vec4_visitor::nir_setup_inputs(nir_shader *shader)
+{
+   nir_inputs = ralloc_array(mem_ctx, src_reg, shader->num_inputs);
+
+   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
+      int offset = var->data.driver_location;
+      unsigned size = type_size(var->type);
+      for (unsigned i = 0; i < size; i++) {
+         src_reg src = src_reg(ATTR, var->data.location + i, var->type);
+         nir_inputs[offset + i] = src;
+      }
+   }
+}
+
+void
+vec4_visitor::nir_setup_uniforms(nir_shader *shader)
+{
+   uniforms = 0;
+
+   nir_uniform_driver_location =
+      rzalloc_array(mem_ctx, unsigned, this->uniform_array_size);
+
+   if (shader_prog) {
+      foreach_list_typed(nir_variable, var, node, &shader->uniforms) {
+         /* UBO's, atomics and samplers don't take up space in the
+            uniform file */
+         if (var->interface_type != NULL || var->type->contains_atomic() ||
+             type_size(var->type) == 0) {
+            continue;
+         }
+
+         assert(uniforms < uniform_array_size);
+         this->uniform_size[uniforms] = type_size(var->type);
+
+         if (strncmp(var->name, "gl_", 3) == 0)
+            nir_setup_builtin_uniform(var);
+         else
+            nir_setup_uniform(var);
+      }
+   } else {
+      /* For ARB_vertex_program, only a single "parameters" variable is
+       * generated to support uniform data.
+       */
+      nir_variable *var = (nir_variable *) shader->uniforms.get_head();
+      assert(shader->uniforms.length() == 1 &&
+             strcmp(var->name, "parameters") == 0);
+
+      assert(uniforms < uniform_array_size);
+      this->uniform_size[uniforms] = type_size(var->type);
+
+      struct gl_program_parameter_list *plist = prog->Parameters;
+      for (unsigned p = 0; p < plist->NumParameters; p++) {
+         uniform_vector_size[uniforms] = plist->Parameters[p].Size;
+
+         /* Parameters should be either vec4 uniforms or single component
+          * constants; matrices and other larger types should have been broken
+          * down earlier.
+          */
+         assert(uniform_vector_size[uniforms] <= 4);
+
+         int i;
+         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
+            stage_prog_data->param[uniforms * 4 + i] = &plist->ParameterValues[p][i];
+         }
+         for (; i < 4; i++) {
+            static const gl_constant_value zero = { 0.0 };
+            stage_prog_data->param[uniforms * 4 + i] = &zero;
+         }
+
+         nir_uniform_driver_location[uniforms] = var->data.driver_location;
+         uniforms++;
+      }
+   }
+}
+
+void
+vec4_visitor::nir_setup_uniform(nir_variable *var)
+{
+   const int name_len = strlen(var->name);
+
+   /* Values for non-builtin uniforms are found through the program's
+    * UniformStorage array, which has one entry per subcomponent that
+    * glGetUniformLocation() could name, laid out in the same order we'd
+    * walk the type.  Scan it and take every entry whose name is exactly
+    * ours, or is ours continued by '.' or '['.
+    */
+   for (unsigned idx = 0; idx < shader_prog->NumUniformStorage; idx++) {
+      struct gl_uniform_storage *entry = &shader_prog->UniformStorage[idx];
+
+      if (entry->builtin)
+         continue;
+
+      if (strncmp(var->name, entry->name, name_len) != 0 ||
+          !(entry->name[name_len] == 0 ||
+            entry->name[name_len] == '.' ||
+            entry->name[name_len] == '[')) {
+         continue;
+      }
+
+      gl_constant_value *values = entry->storage;
+      const unsigned num_vectors =
+         MAX2(entry->array_elements, 1) * entry->type->matrix_columns;
+
+      for (unsigned vec = 0; vec < num_vectors; vec++) {
+         assert(uniforms < uniform_array_size);
+         uniform_vector_size[uniforms] = entry->type->vector_elements;
+
+         int comp;
+         for (comp = 0; comp < uniform_vector_size[uniforms]; comp++) {
+            stage_prog_data->param[uniforms * 4 + comp] = values;
+            values++;
+         }
+         for (; comp < 4; comp++) {
+            static const gl_constant_value zero = { 0.0 };
+            stage_prog_data->param[uniforms * 4 + comp] = &zero;
+         }
+
+         nir_uniform_driver_location[uniforms] = var->data.driver_location;
+         uniforms++;
+      }
+   }
+}
+
+void
+vec4_visitor::nir_setup_builtin_uniform(nir_variable *var)
+{
+   const nir_state_slot *const state_slots = var->state_slots;
+   assert(state_slots != NULL);
+
+   for (unsigned slot = 0; slot < var->num_state_slots; slot++) {
+      /* ir_to_mesa has already set up this state reference, so adding
+       * it again just hands back the same index.  We can point straight
+       * into ParameterValues because, unlike brw_fs.cpp, we never add
+       * new state references during compile.
+       */
+      const int param_index = _mesa_add_state_reference(
+         this->prog->Parameters, (gl_state_index *) state_slots[slot].tokens);
+      gl_constant_value *state_values =
+         &this->prog->Parameters->ParameterValues[param_index][0];
+
+      assert(uniforms < uniform_array_size);
+
+      for (unsigned chan = 0; chan < 4; chan++)
+         stage_prog_data->param[uniforms * 4 + chan] =
+            &state_values[GET_SWZ(state_slots[slot].swizzle, chan)];
+
+      this->uniform_vector_size[uniforms] =
+         (var->type->is_scalar() || var->type->is_vector() ||
+          var->type->is_matrix()) ? var->type->vector_elements : 4;
+
+      nir_uniform_driver_location[uniforms] = var->data.driver_location;
+      uniforms++;
+   }
+}
+
+void
+vec4_visitor::nir_emit_impl(nir_function_impl *impl)
+{
+   nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
+
+   foreach_list_typed(nir_register, reg, node, &impl->registers) {
+      unsigned array_elems =
+         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
+
+      nir_locals[reg->index] = dst_reg(GRF, alloc.allocate(array_elems));
+   }
+
+   nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
+
+   nir_emit_cf_list(&impl->body);
+}
+
+void
+vec4_visitor::nir_emit_cf_list(exec_list *list)
+{
+   exec_list_validate(list);
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_if:
+         nir_emit_if(nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         nir_emit_loop(nir_cf_node_as_loop(node));
+         break;
+
+      case nir_cf_node_block:
+         nir_emit_block(nir_cf_node_as_block(node));
+         break;
+
+      default:
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+void
+vec4_visitor::nir_emit_if(nir_if *if_stmt)
+{
+   /* First, put the condition in f0 */
+   src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1);
+   vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   emit(IF(BRW_PREDICATE_NORMAL));
+
+   nir_emit_cf_list(&if_stmt->then_list);
+
+   /* note: if the else is empty, dead CF elimination will remove it */
+   emit(BRW_OPCODE_ELSE);
+
+   nir_emit_cf_list(&if_stmt->else_list);
+
+   emit(BRW_OPCODE_ENDIF);
+}
+
+void
+vec4_visitor::nir_emit_loop(nir_loop *loop)
+{
+   emit(BRW_OPCODE_DO);
+
+   nir_emit_cf_list(&loop->body);
+
+   emit(BRW_OPCODE_WHILE);
+}
+
+void
+vec4_visitor::nir_emit_block(nir_block *block)
+{
+   nir_foreach_instr(block, instr) {
+      nir_emit_instr(instr);
+   }
+}
+
+void
+vec4_visitor::nir_emit_instr(nir_instr *instr)
+{
+   this->base_ir = instr;
+
+   switch (instr->type) {
+   case nir_instr_type_load_const:
+      nir_emit_load_const(nir_instr_as_load_const(instr));
+      break;
+
+   case nir_instr_type_intrinsic:
+      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
+      break;
+
+   case nir_instr_type_alu:
+      nir_emit_alu(nir_instr_as_alu(instr));
+      break;
+
+   case nir_instr_type_jump:
+      nir_emit_jump(nir_instr_as_jump(instr));
+      break;
+
+   case nir_instr_type_tex:
+      nir_emit_texture(nir_instr_as_tex(instr));
+      break;
+
+   default:
+      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
+      break;
+   }
+}
+
+static dst_reg
+dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg,
+                    unsigned base_offset, nir_src *indirect)
+{
+   dst_reg reg;
+
+   reg = v->nir_locals[nir_reg->index];
+   reg = offset(reg, base_offset);
+   if (indirect) {
+      reg.reladdr =
+         new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
+                                                BRW_REGISTER_TYPE_D,
+                                                1));
+   }
+   return reg;
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(nir_dest dest)
+{
+   assert(!dest.is_ssa);
+   return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
+                              dest.reg.indirect);
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(nir_dest dest, enum brw_reg_type type)
+{
+   return retype(get_nir_dest(dest), type);
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(nir_dest dest, nir_alu_type type)
+{
+   return get_nir_dest(dest, brw_type_for_nir_type(type));
+}
+
+src_reg
+vec4_visitor::get_nir_src(nir_src src, enum brw_reg_type type,
+                          unsigned num_components)
+{
+   dst_reg reg;
+
+   /* SSA sources come from nir_ssa_values, the rest from a NIR register. */
+   if (src.is_ssa) {
+      assert(src.ssa != NULL);
+      reg = nir_ssa_values[src.ssa->index];
+   } else {
+      reg = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
+                                src.reg.indirect);
+   }
+
+   reg = retype(reg, type);
+
+   src_reg result = src_reg(reg);
+   result.swizzle = brw_swizzle_for_size(num_components);
+   return result;
+}
+
+src_reg
+vec4_visitor::get_nir_src(nir_src src, nir_alu_type type,
+                          unsigned num_components)
+{
+   return get_nir_src(src, brw_type_for_nir_type(type), num_components);
+}
+
+src_reg
+vec4_visitor::get_nir_src(nir_src src, unsigned num_components)
+{
+   /* if type is not specified, default to signed int */
+   return get_nir_src(src, nir_type_int, num_components);
+}
+
+void
+vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
+{
+   dst_reg reg = dst_reg(GRF, alloc.allocate(1));
+   reg.type =  BRW_REGISTER_TYPE_F;
+
+   /* @FIXME: consider emitting vector operations to save some MOVs in
+    * cases where the components are representable in 8 bits.
+    * For now, we emit a MOV for each component.
+    */
+   for (unsigned i = 0; i < instr->def.num_components; ++i) {
+      reg.writemask = 1 << i;
+      emit(MOV(reg, src_reg(instr->value.f[i])));
+   }
+
+   /* Set final writemask */
+   reg.writemask = brw_writemask_for_size(instr->def.num_components);
+
+   nir_ssa_values[instr->def.index] = reg;
+}
+
+/* Emit vec4 IR for one NIR intrinsic: input/uniform/UBO loads, output stores,
+ * vertex/instance system values and atomic counter operations. */
+void
+vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   dst_reg dest;
+   src_reg src;
+   bool has_indirect = false;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_input_indirect:
+      has_indirect = true;
+      /* fallthrough */
+   case nir_intrinsic_load_input: {
+      int offset = instr->const_index[0];
+      src = nir_inputs[offset];
+      /* An indirect offset addresses the input being read, i.e. src. */
+      if (has_indirect) {
+         src.reladdr = new(mem_ctx) src_reg(get_nir_src(instr->src[0],
+                                                        BRW_REGISTER_TYPE_D,
+                                                        1));
+      }
+      dest = get_nir_dest(instr->dest, src.type);
+      dest.writemask = brw_writemask_for_size(instr->num_components);
+
+      emit(MOV(dest, src));
+      break;
+   }
+
+   case nir_intrinsic_store_output_indirect:
+      has_indirect = true;
+      /* fallthrough */
+   case nir_intrinsic_store_output: {
+      int varying = instr->const_index[0];
+
+      src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
+                        instr->num_components);
+      dest = dst_reg(src);
+
+      if (has_indirect) {
+         dest.reladdr = new(mem_ctx) src_reg(get_nir_src(instr->src[1],
+                                                         BRW_REGISTER_TYPE_D,
+                                                         1));
+      }
+      output_reg[varying] = dest;
+      break;
+   }
+
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
+
+   case nir_intrinsic_load_vertex_id_zero_base: {
+      src_reg vertex_id =
+         src_reg(nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE]);
+      assert(vertex_id.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, vertex_id.type);
+      emit(MOV(dest, vertex_id));
+      break;
+   }
+
+   case nir_intrinsic_load_base_vertex: {
+      src_reg base_vertex =
+         src_reg(nir_system_values[SYSTEM_VALUE_BASE_VERTEX]);
+      assert(base_vertex.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, base_vertex.type);
+      emit(MOV(dest, base_vertex));
+      break;
+   }
+
+   case nir_intrinsic_load_instance_id: {
+      src_reg instance_id =
+         src_reg(nir_system_values[SYSTEM_VALUE_INSTANCE_ID]);
+      assert(instance_id.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, instance_id.type);
+      emit(MOV(dest, instance_id));
+      break;
+   }
+
+   case nir_intrinsic_load_uniform_indirect:
+      has_indirect = true;
+      /* fallthrough */
+   case nir_intrinsic_load_uniform: {
+      int uniform = instr->const_index[0];
+
+      dest = get_nir_dest(instr->dest);
+
+      if (has_indirect) {
+         /* Split addressing into uniform and offset */
+         int offset = uniform - nir_uniform_driver_location[uniform];
+         assert(offset >= 0);
+
+         uniform -= offset;
+         assert(uniform >= 0);
+
+         src = src_reg(dst_reg(UNIFORM, uniform));
+         src.reg_offset = offset;
+         src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1);
+         src.reladdr = new(mem_ctx) src_reg(tmp);
+      } else {
+         src = src_reg(dst_reg(UNIFORM, uniform));
+      }
+
+      emit(MOV(dest, src));
+      break;
+   }
+
+   case nir_intrinsic_atomic_counter_read:
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_dec: {
+      unsigned surf_index = prog_data->base.binding_table.abo_start +
+         (unsigned) instr->const_index[0];
+      src_reg offset = get_nir_src(instr->src[0], nir_type_int,
+                                   instr->num_components);
+      dest = get_nir_dest(instr->dest);
+
+      switch (instr->intrinsic) {
+         case nir_intrinsic_atomic_counter_inc:
+            emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset,
+                                src_reg(), src_reg());
+            break;
+         case nir_intrinsic_atomic_counter_dec:
+            emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset,
+                                src_reg(), src_reg());
+            break;
+         case nir_intrinsic_atomic_counter_read:
+            emit_untyped_surface_read(surf_index, dest, offset);
+            break;
+         default:
+            unreachable("Unreachable");
+      }
+
+      brw_mark_surface_used(stage_prog_data, surf_index);
+      break;
+   }
+
+   case nir_intrinsic_load_ubo_indirect:
+      has_indirect = true;
+      /* fallthrough */
+   case nir_intrinsic_load_ubo: {
+      nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]);
+      src_reg surf_index;
+
+      dest = get_nir_dest(instr->dest);
+
+      if (const_block_index) {
+         /* The block index is a constant, so just emit the binding table entry
+          * as an immediate.
+          */
+         surf_index = src_reg(prog_data->base.binding_table.ubo_start +
+                              const_block_index->u[0]);
+      } else {
+         /* The block index is not a constant. Evaluate the index expression
+          * per-channel and add the base UBO index; we have to select a value
+          * from any live channel.
+          */
+         surf_index = src_reg(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int,
+                                                   instr->num_components),
+                  src_reg(prog_data->base.binding_table.ubo_start)));
+         surf_index = emit_uniformize(surf_index);
+
+         /* Assume this may touch any UBO. It would be nice to provide
+          * a tighter bound, but the array information is already lowered away.
+          */
+         brw_mark_surface_used(&prog_data->base,
+                               prog_data->base.binding_table.ubo_start +
+                               shader_prog->NumUniformBlocks - 1);
+      }
+
+      unsigned const_offset = instr->const_index[0];
+      src_reg offset;
+
+      if (!has_indirect)  {
+         offset = src_reg(const_offset / 16);
+      } else {
+         offset = src_reg(this, glsl_type::uint_type);
+         emit(SHR(dst_reg(offset), get_nir_src(instr->src[1], nir_type_int, 1),
+                  src_reg(4u)));
+      }
+
+      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
+      packed_consts.type = dest.type;
+
+      emit_pull_constant_load_reg(dst_reg(packed_consts),
+                                  surf_index,
+                                  offset,
+                                  NULL, NULL /* before_block/inst */);
+
+      packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
+      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
+                                            const_offset % 16 / 4,
+                                            const_offset % 16 / 4,
+                                            const_offset % 16 / 4);
+
+      emit(MOV(dest, packed_consts));
+      break;
+   }
+
+   default:
+      unreachable("Unknown intrinsic");
+   }
+}
+/* Pack four NIR channel selectors, in index order, with BRW_SWIZZLE4. */
+static unsigned
+brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
+{
+   return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+}
+
+/* Conditional modifier for the CMP implementing a NIR comparison opcode. */
+static enum brw_conditional_mod
+brw_conditional_for_nir_comparison(nir_op op)
+{
+   switch (op) {
+   case nir_op_ieq:
+   case nir_op_feq:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal4:
+   case nir_op_ball_fequal4:
+      return BRW_CONDITIONAL_Z;
+
+   case nir_op_ine:
+   case nir_op_fne:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal4:
+   case nir_op_bany_fnequal4:
+      return BRW_CONDITIONAL_NZ;
+
+   case nir_op_ult:
+   case nir_op_ilt:
+   case nir_op_flt:
+      return BRW_CONDITIONAL_L;
+
+   case nir_op_uge:
+   case nir_op_ige:
+   case nir_op_fge:
+      return BRW_CONDITIONAL_GE;
+   default:
+      unreachable("not reached: bad operation for comparison");
+   }
+}
+
+/* Emit vec4 IR for one NIR ALU instruction; each source carries its NIR
+ * swizzle/abs/negate and the destination uses the NIR write mask. */
+void
+vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
+{
+   vec4_instruction *inst;
+
+   dst_reg dst = get_nir_dest(instr->dest.dest,
+                              nir_op_infos[instr->op].output_type);
+   dst.writemask = instr->dest.write_mask;
+
+   src_reg op[4];
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      op[i] = get_nir_src(instr->src[i].src,
+                          nir_op_infos[instr->op].input_types[i], 4);
+      op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle);
+      op[i].abs = instr->src[i].abs;
+      op[i].negate = instr->src[i].negate;
+   }
+
+   switch (instr->op) {
+   case nir_op_imov:
+   case nir_op_fmov:
+      inst = emit(MOV(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4:
+      unreachable("not reached: should be handled by lower_vec_to_movs()");
+
+   case nir_op_i2f:
+   case nir_op_u2f:
+      inst = emit(MOV(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_f2i:
+   case nir_op_f2u:
+      inst = emit(MOV(dst, op[0]));
+      break;
+
+   case nir_op_fadd:
+      /* fall through */
+   case nir_op_iadd:
+      inst = emit(ADD(dst, op[0], op[1]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fmul:
+      inst = emit(MUL(dst, op[0], op[1]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_imul: {
+      if (devinfo->gen < 8) {
+         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
+         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+
+         /* For integer multiplication, the MUL uses the low 16 bits of one of
+          * the operands (src0 through SNB, src1 on IVB and later). The MACH
+          * accumulates in the contribution of the upper 16 bits of that
+          * operand. If we can determine that one of the args is in the low
+          * 16 bits, though, we can just emit a single MUL.
+          */
+         if (value0 && value0->u[0] < (1 << 16)) {
+            if (devinfo->gen < 7)
+               emit(MUL(dst, op[0], op[1]));
+            else
+               emit(MUL(dst, op[1], op[0]));
+         } else if (value1 && value1->u[0] < (1 << 16)) {
+            if (devinfo->gen < 7)
+               emit(MUL(dst, op[1], op[0]));
+            else
+               emit(MUL(dst, op[0], op[1]));
+         } else {
+            struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+            emit(MUL(acc, op[0], op[1]));
+            emit(MACH(dst_null_d(), op[0], op[1]));
+            emit(MOV(dst, src_reg(acc)));
+         }
+      } else {
+	 emit(MUL(dst, op[0], op[1]));
+      }
+      break;
+   }
+
+   case nir_op_imul_high:
+   case nir_op_umul_high: {
+      struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+      emit(MUL(acc, op[0], op[1]));
+      emit(MACH(dst, op[0], op[1]));
+      break;
+   }
+
+   case nir_op_frcp:
+      inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fexp2:
+      inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_flog2:
+      inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fsin:
+      inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fcos:
+      inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_idiv:
+   case nir_op_udiv:
+      emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
+      break;
+
+   case nir_op_umod:
+      emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+      break;
+
+   case nir_op_ldexp:
+      unreachable("not reached: should be handled by ldexp_to_arith()");
+
+   case nir_op_fsqrt:
+      inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_frsq:
+      inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fpow:
+      inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_uadd_carry: {
+      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+      emit(ADDC(dst_null_ud(), op[0], op[1]));
+      emit(MOV(dst, src_reg(acc)));
+      break;
+   }
+
+   case nir_op_usub_borrow: {
+      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+      emit(SUBB(dst_null_ud(), op[0], op[1]));
+      emit(MOV(dst, src_reg(acc)));
+      break;
+   }
+
+   case nir_op_ftrunc:
+      inst = emit(RNDZ(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fceil: {
+      src_reg tmp = src_reg(this, glsl_type::float_type);
+      tmp.swizzle =
+         brw_swizzle_for_size(instr->src[0].src.is_ssa ?
+                              instr->src[0].src.ssa->num_components :
+                              instr->src[0].src.reg.reg->num_components);
+
+      op[0].negate = !op[0].negate;
+      emit(RNDD(dst_reg(tmp), op[0]));
+      tmp.negate = true;
+      inst = emit(MOV(dst, tmp));
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
+   case nir_op_ffloor:
+      inst = emit(RNDD(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_ffract:
+      inst = emit(FRC(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fround_even:
+      inst = emit(RNDE(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fmin:
+   case nir_op_imin:
+   case nir_op_umin:
+      inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fmax:
+   case nir_op_imax:
+   case nir_op_umax:
+      inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fddx:
+   case nir_op_fddx_coarse:
+   case nir_op_fddx_fine:
+   case nir_op_fddy:
+   case nir_op_fddy_coarse:
+   case nir_op_fddy_fine:
+      unreachable("derivatives are not valid in vertex shaders");
+
+   case nir_op_flt:
+   case nir_op_ilt:
+   case nir_op_ult:
+   case nir_op_fge:
+   case nir_op_ige:
+   case nir_op_uge:
+   case nir_op_feq:
+   case nir_op_ieq:
+   case nir_op_fne:
+   case nir_op_ine:
+      emit(CMP(dst, op[0], op[1],
+               brw_conditional_for_nir_comparison(instr->op)));
+      break;
+
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4:
+   case nir_op_ball_iequal4: {
+      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
+
+      switch (instr->op) {
+      case nir_op_ball_fequal2:
+      case nir_op_ball_iequal2:
+         tmp.writemask = WRITEMASK_XY;
+         break;
+      case nir_op_ball_fequal3:
+      case nir_op_ball_iequal3:
+         tmp.writemask = WRITEMASK_XYZ;
+         break;
+      case nir_op_ball_fequal4:
+      case nir_op_ball_iequal4:
+         tmp.writemask = WRITEMASK_XYZW;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      emit(CMP(tmp, op[0], op[1],
+               brw_conditional_for_nir_comparison(instr->op)));
+      emit(MOV(dst, src_reg(0)));
+      inst = emit(MOV(dst, src_reg(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      break;
+   }
+
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4:
+   case nir_op_bany_inequal4: {
+      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
+
+      switch (instr->op) {
+      case nir_op_bany_fnequal2:
+      case nir_op_bany_inequal2:
+         tmp.writemask = WRITEMASK_XY;
+         break;
+      case nir_op_bany_fnequal3:
+      case nir_op_bany_inequal3:
+         tmp.writemask = WRITEMASK_XYZ;
+         break;
+      case nir_op_bany_fnequal4:
+      case nir_op_bany_inequal4:
+         tmp.writemask = WRITEMASK_XYZW;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      emit(CMP(tmp, op[0], op[1],
+               brw_conditional_for_nir_comparison(instr->op)));
+      emit(MOV(dst, src_reg(0)));
+      inst = emit(MOV(dst, src_reg(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   }
+
+   case nir_op_inot:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+      }
+      emit(NOT(dst, op[0]));
+      break;
+
+   case nir_op_ixor:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      emit(XOR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ior:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      emit(OR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_iand:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      emit(AND(dst, op[0], op[1]));
+      break;
+
+   case nir_op_b2i:
+      emit(AND(dst, op[0], src_reg(1)));
+      break;
+
+   case nir_op_b2f:
+      op[0].type = BRW_REGISTER_TYPE_D;
+      dst.type = BRW_REGISTER_TYPE_D;
+      emit(AND(dst, op[0], src_reg(0x3f800000u)));
+      dst.type = BRW_REGISTER_TYPE_F;
+      break;
+
+   case nir_op_f2b:
+      emit(CMP(dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
+      break;
+
+   case nir_op_i2b:
+      emit(CMP(dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+      break;
+
+   case nir_op_fnoise1_1:
+   case nir_op_fnoise1_2:
+   case nir_op_fnoise1_3:
+   case nir_op_fnoise1_4:
+   case nir_op_fnoise2_1:
+   case nir_op_fnoise2_2:
+   case nir_op_fnoise2_3:
+   case nir_op_fnoise2_4:
+   case nir_op_fnoise3_1:
+   case nir_op_fnoise3_2:
+   case nir_op_fnoise3_3:
+   case nir_op_fnoise3_4:
+   case nir_op_fnoise4_1:
+   case nir_op_fnoise4_2:
+   case nir_op_fnoise4_3:
+   case nir_op_fnoise4_4:
+      unreachable("not reached: should be handled by lower_noise");
+
+   case nir_op_unpack_half_2x16_split_x:
+   case nir_op_unpack_half_2x16_split_y:
+   case nir_op_pack_half_2x16_split:
+      unreachable("not reached: should not occur in vertex shader");
+
+   case nir_op_unpack_snorm_2x16:
+   case nir_op_unpack_unorm_2x16:
+   case nir_op_pack_snorm_2x16:
+   case nir_op_pack_unorm_2x16:
+      unreachable("not reached: should be handled by lower_packing_builtins");
+
+   case nir_op_unpack_half_2x16:
+      /* As NIR does not guarantee that we have a correct swizzle outside the
+       * boundaries of a vector, and the implementation of emit_unpack_half_2x16
+       * uses the source operand in an operation with WRITEMASK_Y while our
+       * source operand has only size 1, it accessed incorrect data producing
+       * regressions in Piglit. We repeat the swizzle of the first component on the
+       * rest of components to avoid regressions. In the vec4_visitor IR code path
+       * this is not needed because the operand has already the correct swizzle.
+       */
+      op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle);
+      emit_unpack_half_2x16(dst, op[0]);
+      break;
+
+   case nir_op_pack_half_2x16:
+      emit_pack_half_2x16(dst, op[0]);
+      break;
+
+   case nir_op_unpack_unorm_4x8:
+      emit_unpack_unorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_pack_unorm_4x8:
+      emit_pack_unorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_unpack_snorm_4x8:
+      emit_unpack_snorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_pack_snorm_4x8:
+      emit_pack_snorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_bitfield_reverse:
+      emit(BFREV(dst, op[0]));
+      break;
+
+   case nir_op_bit_count:
+      emit(CBIT(dst, op[0]));
+      break;
+
+   case nir_op_ufind_msb:
+   case nir_op_ifind_msb: {
+      src_reg temp = src_reg(this, glsl_type::uint_type);
+
+      inst = emit(FBH(dst_reg(temp), op[0]));
+      inst->dst.writemask = WRITEMASK_XYZW;
+
+      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
+       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
+       * subtract the result from 31 to convert the MSB count into an LSB count.
+       */
+      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
+      temp.swizzle = BRW_SWIZZLE_NOOP;
+      emit(MOV(dst, temp));
+
+      src_reg src_tmp = src_reg(dst);
+      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
+
+      src_tmp.negate = true;
+      inst = emit(ADD(dst, src_tmp, src_reg(31)));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
+   case nir_op_find_lsb:
+      emit(FBL(dst, op[0]));
+      break;
+
+   case nir_op_ubitfield_extract:
+   case nir_op_ibitfield_extract:
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      emit(BFE(dst, op[2], op[1], op[0]));
+      break;
+
+   case nir_op_bfm:
+      emit(BFI1(dst, op[0], op[1]));
+      break;
+
+   case nir_op_bfi:
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      emit(BFI2(dst, op[0], op[1], op[2]));
+      break;
+
+   case nir_op_bitfield_insert:
+      unreachable("not reached: should be handled by "
+                  "lower_instructions::bitfield_insert_to_bfm_bfi");
+
+   case nir_op_fsign:
+      /* AND(val, 0x80000000) gives the sign bit.
+       *
+       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+       * zero.
+       */
+      emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
+
+      op[0].type = BRW_REGISTER_TYPE_UD;
+      dst.type = BRW_REGISTER_TYPE_UD;
+      emit(AND(dst, op[0], src_reg(0x80000000u)));
+
+      inst = emit(OR(dst, src_reg(dst), src_reg(0x3f800000u)));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      dst.type = BRW_REGISTER_TYPE_F;
+
+      if (instr->dest.saturate) {
+         inst = emit(MOV(dst, src_reg(dst)));
+         inst->saturate = true;
+      }
+      break;
+
+   case nir_op_isign:
+      /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
+       *               -> non-negative val generates 0x00000000.
+       *  Predicated OR sets 1 if val is positive.
+       */
+      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
+      emit(ASR(dst, op[0], src_reg(31)));
+      inst = emit(OR(dst, src_reg(dst), src_reg(1)));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
+   case nir_op_ishl:
+      emit(SHL(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ishr:
+      emit(ASR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ushr:
+      emit(SHR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ffma:
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      inst = emit(MAD(dst, op[2], op[1], op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_flrp:
+      inst = emit_lrp(dst, op[0], op[1], op[2]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_bcsel:
+      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
+   case nir_op_fdot2:
+      inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fdot3:
+      inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fdot4:
+      inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_bany2:
+   case nir_op_bany3:
+   case nir_op_bany4: {
+      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
+      tmp.writemask = brw_writemask_for_size(nir_op_infos[instr->op].input_sizes[0]);
+
+      emit(CMP(tmp, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+
+      emit(MOV(dst, src_reg(0)));
+      inst = emit(MOV(dst, src_reg(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   }
+
+   case nir_op_fabs:
+   case nir_op_iabs:
+   case nir_op_fneg:
+   case nir_op_ineg:
+   case nir_op_fsat:
+      unreachable("not reached: should be lowered by lower_source mods");
+
+   case nir_op_fdiv:
+      unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler");
+
+   case nir_op_fmod:
+      unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler");
+
+   case nir_op_fsub:
+   case nir_op_isub:
+      unreachable("not reached: should be handled by ir_sub_to_add_neg");
+
+   default:
+      unreachable("Unimplemented ALU operation");
+   }
+
+   /* If we need to do a boolean resolve, replace the result with -(x & 1)
+    * to sign extend the low bit to 0/~0
+    */
+   if (devinfo->gen <= 5 &&
+       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) ==
+       BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+      dst_reg masked = dst_reg(this, glsl_type::int_type);
+      masked.writemask = dst.writemask;
+      emit(AND(masked, src_reg(dst), src_reg(1)));
+      src_reg masked_neg = src_reg(masked);
+      masked_neg.negate = true;
+      emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg));
+   }
+}
+
+/* Emit the vec4 opcode for a NIR break or continue.  Any other jump type,
+ * nir_jump_return included, ends up in unreachable(). */
+void
+vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
+{
+   if (instr->type == nir_jump_break) {
+      emit(BRW_OPCODE_BREAK);
+      return;
+   }
+
+   if (instr->type == nir_jump_continue) {
+      emit(BRW_OPCODE_CONTINUE);
+      return;
+   }
+
+   /* nir_jump_return and every other jump type are not handled here. */
+   unreachable("unknown jump");
+}
+
+/* Translate a NIR texture opcode into the corresponding GLSL IR texture
+ * opcode.  A texop with no case below reaches unreachable().
+ */
+enum ir_texture_opcode
+ir_texture_opcode_for_nir_texop(nir_texop texop)
+{
+   switch (texop) {
+   case nir_texop_lod:          return ir_lod;
+   case nir_texop_query_levels: return ir_query_levels;
+   case nir_texop_tex:          return ir_tex;
+   case nir_texop_tg4:          return ir_tg4;
+   case nir_texop_txb:          return ir_txb;
+   case nir_texop_txd:          return ir_txd;
+   case nir_texop_txf:          return ir_txf;
+   case nir_texop_txf_ms:       return ir_txf_ms;
+   case nir_texop_txl:          return ir_txl;
+   case nir_texop_txs:          return ir_txs;
+
+   default:
+      unreachable("unknown texture opcode");
+   }
+}
+/* Pick the GLSL vector type with `components` channels for a NIR ALU base
+ * type; unrecognized base types yield glsl_type::error_type. */
+const glsl_type *
+glsl_type_for_nir_alu_type(nir_alu_type alu_type,
+                           unsigned components)
+{
+   const glsl_type *type = glsl_type::error_type;
+
+   if (alu_type == nir_type_float)
+      type = glsl_type::vec(components);
+   else if (alu_type == nir_type_int)
+      type = glsl_type::ivec(components);
+   else if (alu_type == nir_type_unsigned)
+      type = glsl_type::uvec(components);
+   else if (alu_type == nir_type_bool)
+      type = glsl_type::bvec(components);
+
+   return type;
+}
+
+/* Emit vec4 IR for a NIR texture instruction: gather its sources, build the
+ * constant offset word and hand everything to emit_texture(). */
+void
+vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
+{
+   unsigned sampler = instr->sampler_index;
+   src_reg sampler_reg = src_reg(sampler);
+   src_reg coordinate;
+   const glsl_type *coord_type = NULL;
+   src_reg shadow_comparitor;
+   src_reg offset_value;
+   src_reg lod, lod2;
+   src_reg sample_index;
+   src_reg mcs;
+
+   const glsl_type *dest_type =
+      glsl_type_for_nir_alu_type(instr->dest_type,
+                                 nir_tex_instr_dest_size(instr));
+   dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
+
+   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
+    * emitting anything other than setting up the constant result.
+    */
+   if (instr->op == nir_texop_tg4) {
+      int swiz = GET_SWZ(key->tex.swizzles[sampler], instr->component);
+      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
+         emit(MOV(dest, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
+         return;
+      }
+   }
+
+   /* Load the texture operation sources */
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      switch (instr->src[i].src_type) {
+      case nir_tex_src_comparitor:
+         shadow_comparitor = get_nir_src(instr->src[i].src,
+                                         BRW_REGISTER_TYPE_F, 1);
+         break;
+
+      case nir_tex_src_coord: {
+         unsigned src_size = nir_tex_instr_src_size(instr, i);
+
+         switch (instr->op) {
+         case nir_texop_txf:
+         case nir_texop_txf_ms:
+            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D,
+                                     src_size);
+            coord_type = glsl_type::ivec(src_size);
+            break;
+         default:
+            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                                     src_size);
+            coord_type = glsl_type::vec(src_size);
+            break;
+         }
+         break;
+      }
+
+      case nir_tex_src_ddx:
+         lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                           nir_tex_instr_src_size(instr, i));
+         break;
+
+      case nir_tex_src_ddy:
+         lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                           nir_tex_instr_src_size(instr, i));
+         break;
+
+      case nir_tex_src_lod:
+         switch (instr->op) {
+         case nir_texop_txs:
+         case nir_texop_txf:
+            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+            break;
+         default:
+            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1);
+            break;
+         }
+         break;
+
+      case nir_tex_src_ms_index: {
+         sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+         assert(coord_type != NULL);
+         if (devinfo->gen >= 7 &&
+             key->tex.compressed_multisample_layout_mask & (1<<sampler)) {
+            mcs = emit_mcs_fetch(coord_type, coordinate, sampler_reg);
+         } else {
+            mcs = src_reg(0u);
+         }
+         mcs = retype(mcs, BRW_REGISTER_TYPE_UD);
+         break;
+      }
+
+      case nir_tex_src_offset:
+         offset_value = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
+         break;
+
+      case nir_tex_src_sampler_offset: {
+         /* The highest sampler which may be used by this operation is
+          * the last element of the array. Mark it here, because the generator
+          * doesn't have enough information to determine the bound.
+          */
+         uint32_t array_size = instr->sampler_array_size;
+         uint32_t max_used = sampler + array_size - 1;
+         if (instr->op == nir_texop_tg4) {
+            max_used += prog_data->base.binding_table.gather_texture_start;
+         } else {
+            max_used += prog_data->base.binding_table.texture_start;
+         }
+
+         brw_mark_surface_used(&prog_data->base, max_used);
+
+         /* Emit code to evaluate the actual indexing expression */
+         src_reg src = get_nir_src(instr->src[i].src, 1);
+         src_reg temp(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(temp), src, src_reg(sampler)));
+         sampler_reg = emit_uniformize(temp);
+         break;
+      }
+
+      case nir_tex_src_projector:
+         unreachable("Should be lowered by do_lower_texture_projection");
+
+      case nir_tex_src_bias:
+         unreachable("LOD bias is not valid for vertex shaders.\n");
+
+      default:
+         unreachable("unknown texture source");
+      }
+   }
+
+   uint32_t constant_offset = 0;
+   for (unsigned i = 0; i < 3; i++) {
+      if (instr->const_offset[i] != 0) {
+         constant_offset = brw_texture_offset(instr->const_offset, 3);
+         break;
+      }
+   }
+
+   /* Stuff the channel select bits in the top of the texture offset */
+   if (instr->op == nir_texop_tg4)
+      constant_offset |= gather_channel(instr->component, sampler) << 16;
+
+   ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op);
+
+   bool is_cube_array =
+      instr->op == nir_texop_txs &&
+      instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
+      instr->is_array;
+
+   emit_texture(op, dest, dest_type, coordinate, instr->coord_components,
+                shadow_comparitor,
+                lod, lod2, sample_index,
+                constant_offset, offset_value,
+                mcs, is_cube_array, sampler, sampler_reg);
+}
+
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index 555c42e2f24..617c9889cad 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -280,15 +280,15 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
     */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (unsigned int i = 0; i < 3; i++) {
-	 if (inst->src[i].file == GRF) {
-	    spill_costs[inst->src[i].reg] += loop_scale;
+         if (inst->src[i].file == GRF) {
+            spill_costs[inst->src[i].reg] += loop_scale;
             if (inst->src[i].reladdr)
                no_spill[inst->src[i].reg] = true;
-	 }
+         }
       }
 
       if (inst->dst.file == GRF) {
-	 spill_costs[inst->dst.reg] += loop_scale;
+         spill_costs[inst->dst.reg] += loop_scale;
          if (inst->dst.reladdr)
             no_spill[inst->dst.reg] = true;
       }
@@ -296,12 +296,12 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
       switch (inst->opcode) {
 
       case BRW_OPCODE_DO:
-	 loop_scale *= 10;
-	 break;
+         loop_scale *= 10;
+         break;
 
       case BRW_OPCODE_WHILE:
-	 loop_scale /= 10;
-	 break;
+         loop_scale /= 10;
+         break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
@@ -309,12 +309,12 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
             if (inst->src[i].file == GRF)
                no_spill[inst->src[i].reg] = true;
          }
-	 if (inst->dst.file == GRF)
-	    no_spill[inst->dst.reg] = true;
-	 break;
+         if (inst->dst.file == GRF)
+            no_spill[inst->dst.reg] = true;
+         break;
 
       default:
-	 break;
+         break;
       }
    }
 }
@@ -339,7 +339,7 @@ void
 vec4_visitor::spill_reg(int spill_reg_nr)
 {
    assert(alloc.sizes[spill_reg_nr] == 1);
-   unsigned int spill_offset = c->last_scratch++;
+   unsigned int spill_offset = last_scratch++;
 
    /* Generate spill/unspill instructions for the objects being spilled. */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 236fa51f92c..20b628e9192 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -287,7 +287,7 @@ vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements
 }
 
 src_reg
-vec4_visitor::fix_3src_operand(src_reg src)
+vec4_visitor::fix_3src_operand(const src_reg &src)
 {
    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
     * able to use vertical stride of zero to replicate the vec4 uniform, like
@@ -313,7 +313,20 @@ vec4_visitor::fix_3src_operand(src_reg src)
 }
 
 src_reg
-vec4_visitor::fix_math_operand(src_reg src)
+vec4_visitor::resolve_source_modifiers(const src_reg &src)
+{
+   if (!src.abs && !src.negate)
+      return src;
+
+   dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
+   resolved.type = src.type;
+   emit(MOV(resolved, src));
+
+   return src_reg(resolved);
+}
+
+src_reg
+vec4_visitor::fix_math_operand(const src_reg &src)
 {
    if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
       return src;
@@ -338,7 +351,7 @@ vec4_visitor::fix_math_operand(src_reg src)
    return src_reg(expanded);
 }
 
-void
+vec4_instruction *
 vec4_visitor::emit_math(enum opcode opcode,
                         const dst_reg &dst,
                         const src_reg &src0, const src_reg &src1)
@@ -350,11 +363,13 @@ vec4_visitor::emit_math(enum opcode opcode,
       /* MATH on Gen6 must be align1, so we can't do writemasks. */
       math->dst = dst_reg(this, glsl_type::vec4_type);
       math->dst.type = dst.type;
-      emit(MOV(dst, src_reg(math->dst)));
+      math = emit(MOV(dst, src_reg(math->dst)));
    } else if (devinfo->gen < 6) {
       math->base_mrf = 1;
       math->mlen = src1.file == BAD_FILE ? 1 : 2;
    }
+
+   return math;
 }
 
 void
@@ -572,9 +587,18 @@ vec4_visitor::visit_instructions(const exec_list *list)
    }
 }
 
-
-static int
-type_size(const struct glsl_type *type)
+/**
+ * Returns the minimum number of vec4 elements needed to pack a type.
+ *
+ * For simple types, it will return 1 (a single vec4); for matrices, the
+ * number of columns; for arrays and structs, the sum of the type_size() of
+ * each of their elements; and for samplers and atomic counters, zero.
+ *
+ * This method is useful to calculate how much register space is needed to
+ * store a particular type.
+ */
+int
+vec4_visitor::type_size(const struct glsl_type *type)
 {
    unsigned int i;
    int size;
@@ -603,6 +627,9 @@ type_size(const struct glsl_type *type)
 	 size += type_size(type->fields.structure[i].type);
       }
       return size;
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
+
    case GLSL_TYPE_SAMPLER:
       /* Samplers take up no register space, since they're baked in at
        * link time.
@@ -611,6 +638,7 @@ type_size(const struct glsl_type *type)
    case GLSL_TYPE_ATOMIC_UINT:
       return 0;
    case GLSL_TYPE_IMAGE:
+      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_DOUBLE:
    case GLSL_TYPE_ERROR:
@@ -627,7 +655,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(type_size(type));
+   this->reg = v->alloc.allocate(v->type_size(type));
 
    if (type->is_array() || type->is_record()) {
       this->swizzle = BRW_SWIZZLE_NOOP;
@@ -645,7 +673,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(type_size(type) * size);
+   this->reg = v->alloc.allocate(v->type_size(type) * size);
 
    this->swizzle = BRW_SWIZZLE_NOOP;
 
@@ -657,7 +685,7 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(type_size(type));
+   this->reg = v->alloc.allocate(v->type_size(type));
 
    if (type->is_array() || type->is_record()) {
       this->writemask = WRITEMASK_XYZW;
@@ -668,6 +696,21 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
    this->type = brw_type_for_base_type(type);
 }
 
+void
+vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
+                                          unsigned n)
+{
+   static const gl_constant_value zero = { 0 };
+
+   for (unsigned i = 0; i < n; ++i)
+      stage_prog_data->param[4 * uniforms + i] = &values[i];
+
+   for (unsigned i = n; i < 4; ++i)
+      stage_prog_data->param[4 * uniforms + i] = &zero;
+
+   uniform_vector_size[uniforms++] = n;
+}
+
 /* Our support for uniforms is piggy-backed on the struct
  * gl_fragment_program, because that's where the values actually
  * get stored, rather than in some global gl_shader_program uniform
@@ -697,26 +740,13 @@ vec4_visitor::setup_uniform_values(ir_variable *ir)
          continue;
       }
 
-      gl_constant_value *components = storage->storage;
-      unsigned vector_count = (MAX2(storage->array_elements, 1) *
-                               storage->type->matrix_columns);
+      const unsigned vector_count = (MAX2(storage->array_elements, 1) *
+                                     storage->type->matrix_columns);
+      const unsigned vector_size = storage->type->vector_elements;
 
-      for (unsigned s = 0; s < vector_count; s++) {
-         assert(uniforms < uniform_array_size);
-         uniform_vector_size[uniforms] = storage->type->vector_elements;
-
-         int i;
-         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
-            stage_prog_data->param[uniforms * 4 + i] = components;
-            components++;
-         }
-         for (; i < 4; i++) {
-            static gl_constant_value zero = { 0.0 };
-            stage_prog_data->param[uniforms * 4 + i] = &zero;
-         }
-
-         uniforms++;
-      }
+      for (unsigned s = 0; s < vector_count; s++)
+         setup_vector_uniform_values(&storage->storage[s * vector_size],
+                                     vector_size);
    }
 }
 
@@ -1043,8 +1073,6 @@ vec4_visitor::visit(ir_variable *ir)
       for (int i = 0; i < type_size(ir->type); i++) {
 	 output_reg[ir->data.location + i] = *reg;
 	 output_reg[ir->data.location + i].reg_offset = i;
-	 output_reg[ir->data.location + i].type =
-            brw_type_for_base_type(ir->type->get_scalar_type());
 	 output_reg_annotation[ir->data.location + i] = ir->name;
       }
       break;
@@ -1064,7 +1092,7 @@ vec4_visitor::visit(ir_variable *ir)
        * Some uniforms, such as samplers and atomic counters, have no actual
        * storage, so we should ignore them.
        */
-      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
+      if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
          return;
 
       /* Track how big the whole uniform variable is, in case we need to put a
@@ -1081,7 +1109,7 @@ vec4_visitor::visit(ir_variable *ir)
       break;
 
    case ir_var_system_value:
-      reg = make_reg_for_system_value(ir);
+      reg = make_reg_for_system_value(ir->data.location, ir->type);
       break;
 
    default:
@@ -1253,7 +1281,7 @@ vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
    return true;
 }
 
-void
+vec4_instruction *
 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                           src_reg src0, src_reg src1)
 {
@@ -1268,9 +1296,11 @@ vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
       inst->predicate = BRW_PREDICATE_NORMAL;
    }
+
+   return inst;
 }
 
-void
+vec4_instruction *
 vec4_visitor::emit_lrp(const dst_reg &dst,
                        const src_reg &x, const src_reg &y, const src_reg &a)
 {
@@ -1278,8 +1308,8 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
       /* Note that the instruction's argument order is reversed from GLSL
        * and the IR.
        */
-      emit(LRP(dst,
-               fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
+     return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
+                     fix_3src_operand(x)));
    } else {
       /* Earlier generations don't support three source operations, so we
        * need to emit x*(1-a) + y*a.
@@ -1294,7 +1324,7 @@ vec4_visitor::emit_lrp(const dst_reg &dst,
       emit(MUL(y_times_a, y, a));
       emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
       emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
-      emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
+      return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
    }
 }
 
@@ -1375,15 +1405,19 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
       emit(pull);
 }
 
-void
-vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
+src_reg
+vec4_visitor::emit_uniformize(const src_reg &src)
 {
    const src_reg chan_index(this, glsl_type::uint_type);
+   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
+                              src.type);
 
    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
       ->force_writemask_all = true;
    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
       ->force_writemask_all = true;
+
+   return src_reg(dst);
 }
 
 void
@@ -1555,6 +1589,10 @@ vec4_visitor::visit(ir_expression *ir)
    case ir_unop_noise:
       unreachable("not reached: should be handled by lower_noise");
 
+   case ir_unop_subroutine_to_int:
+      emit(MOV(result_dst, op[0]));
+      break;
+
    case ir_binop_add:
       emit(ADD(result_dst, op[0], op[1]));
       break;
@@ -1602,20 +1640,13 @@ vec4_visitor::visit(ir_expression *ir)
       assert(ir->type->is_integer());
       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
       break;
-   case ir_binop_carry: {
-      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
 
-      emit(ADDC(dst_null_ud(), op[0], op[1]));
-      emit(MOV(result_dst, src_reg(acc)));
-      break;
-   }
-   case ir_binop_borrow: {
-      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+   case ir_binop_carry:
+      unreachable("Should have been lowered by carry_to_arith().");
+
+   case ir_binop_borrow:
+      unreachable("Should have been lowered by borrow_to_arith().");
 
-      emit(SUBB(dst_null_ud(), op[0], op[1]));
-      emit(MOV(result_dst, src_reg(acc)));
-      break;
-   }
    case ir_binop_mod:
       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
       assert(ir->type->is_integer());
@@ -1734,16 +1765,11 @@ vec4_visitor::visit(ir_expression *ir)
       emit(MOV(result_dst, op[0]));
       break;
    case ir_unop_b2i:
-      emit(AND(result_dst, op[0], src_reg(1)));
-      break;
    case ir_unop_b2f:
       if (devinfo->gen <= 5) {
          resolve_bool_comparison(ir->operands[0], &op[0]);
       }
-      op[0].type = BRW_REGISTER_TYPE_D;
-      result_dst.type = BRW_REGISTER_TYPE_D;
-      emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
-      result_dst.type = BRW_REGISTER_TYPE_F;
+      emit(MOV(result_dst, negate(op[0])));
       break;
    case ir_unop_f2b:
       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
@@ -1839,7 +1865,7 @@ vec4_visitor::visit(ir_expression *ir)
          surf_index = src_reg(this, glsl_type::uint_type);
          emit(ADD(dst_reg(surf_index), op[0],
                   src_reg(prog_data->base.binding_table.ubo_start)));
-         emit_uniformize(dst_reg(surf_index), surf_index);
+         surf_index = emit_uniformize(surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
@@ -2439,6 +2465,8 @@ vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
                           src_reg(), src_reg());
    }
+
+   brw_mark_surface_used(stage_prog_data, surf_index);
 }
 
 void
@@ -2456,7 +2484,8 @@ vec4_visitor::visit(ir_call *ir)
 }
 
 src_reg
-vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
+vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
+                             src_reg coordinate, src_reg sampler)
 {
    vec4_instruction *inst =
       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
@@ -2483,21 +2512,21 @@ vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler
    }
 
    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
-   int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
+   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
    int zero_mask = 0xf & ~coord_mask;
 
-   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
+   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
             coordinate));
 
-   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
+   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
             src_reg(0)));
 
    emit(inst);
    return src_reg(inst->dst);
 }
 
-static bool
-is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
+bool
+vec4_visitor::is_high_sampler(src_reg sampler)
 {
    if (devinfo->gen < 8 && !devinfo->is_haswell)
       return false;
@@ -2505,6 +2534,183 @@ is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
 }
 
+void
+vec4_visitor::emit_texture(ir_texture_opcode op,
+                           dst_reg dest,
+                           const glsl_type *dest_type,
+                           src_reg coordinate,
+                           int coord_components,
+                           src_reg shadow_comparitor,
+                           src_reg lod, src_reg lod2,
+                           src_reg sample_index,
+                           uint32_t constant_offset,
+                           src_reg offset_value,
+                           src_reg mcs,
+                           bool is_cube_array,
+                           uint32_t sampler,
+                           src_reg sampler_reg)
+{
+   enum opcode opcode;
+   switch (op) {
+   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
+   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
+   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
+   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
+   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
+   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
+   case ir_tg4: opcode = offset_value.file != BAD_FILE
+                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
+   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
+   case ir_txb:
+      unreachable("TXB is not valid for vertex shaders.");
+   case ir_lod:
+      unreachable("LOD is not valid for vertex shaders.");
+   default:
+      unreachable("Unrecognized tex op");
+   }
+
+   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
+      opcode, dst_reg(this, dest_type));
+
+   inst->offset = constant_offset;
+
+   /* The message header is necessary for:
+    * - Gen4 (always)
+    * - Gen9+ for selecting SIMD4x2
+    * - Texel offsets
+    * - Gather channel selection
+    * - Sampler indices too large to fit in a 4-bit value.
+    */
+   inst->header_size =
+      (devinfo->gen < 5 || devinfo->gen >= 9 ||
+       inst->offset != 0 || op == ir_tg4 ||
+       is_high_sampler(sampler_reg)) ? 1 : 0;
+   inst->base_mrf = 2;
+   inst->mlen = inst->header_size + 1; /* always at least one */
+   inst->dst.writemask = WRITEMASK_XYZW;
+   inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
+
+   inst->src[1] = sampler_reg;
+
+   /* MRF for the first parameter */
+   int param_base = inst->base_mrf + inst->header_size;
+
+   if (op == ir_txs || op == ir_query_levels) {
+      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
+      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
+   } else {
+      /* Load the coordinate */
+      /* FINISHME: gl_clamp_mask and saturate */
+      int coord_mask = (1 << coord_components) - 1;
+      int zero_mask = 0xf & ~coord_mask;
+
+      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
+               coordinate));
+
+      if (zero_mask != 0) {
+         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
+                  src_reg(0)));
+      }
+      /* Load the shadow comparitor */
+      if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
+	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
+			  WRITEMASK_X),
+		  shadow_comparitor));
+	 inst->mlen++;
+      }
+
+      /* Load the LOD info */
+      if (op == ir_tex || op == ir_txl) {
+	 int mrf, writemask;
+	 if (devinfo->gen >= 5) {
+	    mrf = param_base + 1;
+	    if (shadow_comparitor.file != BAD_FILE) {
+	       writemask = WRITEMASK_Y;
+	       /* mlen already incremented */
+	    } else {
+	       writemask = WRITEMASK_X;
+	       inst->mlen++;
+	    }
+	 } else /* devinfo->gen == 4 */ {
+	    mrf = param_base;
+	    writemask = WRITEMASK_W;
+	 }
+         lod.swizzle = BRW_SWIZZLE_XXXX;
+	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
+      } else if (op == ir_txf) {
+         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
+      } else if (op == ir_txf_ms) {
+         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
+                  sample_index));
+         if (devinfo->gen >= 7) {
+            /* MCS data is in the first channel of `mcs`, but we need to get it into
+             * the .y channel of the second vec4 of params, so replicate .x across
+             * the whole vec4 and then mask off everything except .y
+             */
+            mcs.swizzle = BRW_SWIZZLE_XXXX;
+            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
+                     mcs));
+         }
+         inst->mlen++;
+      } else if (op == ir_txd) {
+         const brw_reg_type type = lod.type;
+
+	 if (devinfo->gen >= 5) {
+	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
+	    inst->mlen++;
+
+	    if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
+	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
+	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
+	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
+	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
+	       inst->mlen++;
+
+               if (shadow_comparitor.file != BAD_FILE) {
+                  emit(MOV(dst_reg(MRF, param_base + 2,
+                                   shadow_comparitor.type, WRITEMASK_Z),
+                           shadow_comparitor));
+               }
+	    }
+	 } else /* devinfo->gen == 4 */ {
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
+	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
+	    inst->mlen += 2;
+	 }
+      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
+         if (shadow_comparitor.file != BAD_FILE) {
+            emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
+                     shadow_comparitor));
+         }
+
+         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
+                  offset_value));
+         inst->mlen++;
+      }
+   }
+
+   emit(inst);
+
+   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
+    * spec requires layers.
+    */
+   if (op == ir_txs && is_cube_array) {
+      emit_math(SHADER_OPCODE_INT_QUOTIENT,
+                writemask(inst->dst, WRITEMASK_Z),
+                src_reg(inst->dst), src_reg(6));
+   }
+
+   if (devinfo->gen == 6 && op == ir_tg4) {
+      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
+   }
+
+   swizzle_result(op, dest,
+                  src_reg(inst->dst), sampler, dest_type);
+}
+
 void
 vec4_visitor::visit(ir_texture *ir)
 {
@@ -2535,11 +2741,9 @@ vec4_visitor::visit(ir_texture *ir)
 
       /* Emit code to evaluate the actual indexing expression */
       nonconst_sampler_index->accept(this);
-      dst_reg temp(this, glsl_type::uint_type);
-      emit(ADD(temp, this->result, src_reg(sampler)));
-      emit_uniformize(temp, src_reg(temp));
-
-      sampler_reg = src_reg(temp);
+      src_reg temp(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
+      sampler_reg = emit_uniformize(temp);
    } else {
       /* Single sampler, or constant array index; the indexing expression
        * is just an immediate.
@@ -2572,7 +2776,9 @@ vec4_visitor::visit(ir_texture *ir)
     * generating these values may involve SEND messages that need the MRFs.
     */
    src_reg coordinate;
+   int coord_components = 0;
    if (ir->coordinate) {
+      coord_components = ir->coordinate->type->vector_elements;
       ir->coordinate->accept(this);
       coordinate = this->result;
    }
@@ -2590,42 +2796,35 @@ vec4_visitor::visit(ir_texture *ir)
       offset_value = src_reg(this->result);
    }
 
-   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
-   src_reg lod, dPdx, dPdy, sample_index, mcs;
+   src_reg lod, lod2, sample_index, mcs;
    switch (ir->op) {
    case ir_tex:
       lod = src_reg(0.0f);
-      lod_type = glsl_type::float_type;
       break;
    case ir_txf:
    case ir_txl:
    case ir_txs:
       ir->lod_info.lod->accept(this);
       lod = this->result;
-      lod_type = ir->lod_info.lod->type;
       break;
    case ir_query_levels:
       lod = src_reg(0);
-      lod_type = glsl_type::int_type;
       break;
    case ir_txf_ms:
       ir->lod_info.sample_index->accept(this);
       sample_index = this->result;
-      sample_index_type = ir->lod_info.sample_index->type;
 
       if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
-         mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
+         mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
       else
          mcs = src_reg(0u);
       break;
    case ir_txd:
       ir->lod_info.grad.dPdx->accept(this);
-      dPdx = this->result;
+      lod = this->result;
 
       ir->lod_info.grad.dPdy->accept(this);
-      dPdy = this->result;
-
-      lod_type = ir->lod_info.grad.dPdx->type;
+      lod2 = this->result;
       break;
    case ir_txb:
    case ir_lod:
@@ -2633,175 +2832,31 @@ vec4_visitor::visit(ir_texture *ir)
       break;
    }
 
-   enum opcode opcode;
-   switch (ir->op) {
-   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
-   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
-   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
-   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
-   case ir_tg4: opcode = has_nonconstant_offset
-                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
-   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
-   case ir_txb:
-      unreachable("TXB is not valid for vertex shaders.");
-   case ir_lod:
-      unreachable("LOD is not valid for vertex shaders.");
-   default:
-      unreachable("Unrecognized tex op");
-   }
-
-   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
-      opcode, dst_reg(this, ir->type));
-
+   uint32_t constant_offset = 0;
    if (ir->offset != NULL && !has_nonconstant_offset) {
-      inst->offset =
+      constant_offset  =
          brw_texture_offset(ir->offset->as_constant()->value.i,
                             ir->offset->type->vector_elements);
    }
 
    /* Stuff the channel select bits in the top of the texture offset */
    if (ir->op == ir_tg4)
-      inst->offset |= gather_channel(ir, sampler) << 16;
+      constant_offset |=
+         gather_channel( ir->lod_info.component->as_constant()->value.i[0],
+                         sampler) << 16;
 
-   /* The message header is necessary for:
-    * - Gen4 (always)
-    * - Gen9+ for selecting SIMD4x2
-    * - Texel offsets
-    * - Gather channel selection
-    * - Sampler indices too large to fit in a 4-bit value.
-    */
-   inst->header_size =
-      (devinfo->gen < 5 || devinfo->gen >= 9 ||
-       inst->offset != 0 || ir->op == ir_tg4 ||
-       is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
-   inst->base_mrf = 2;
-   inst->mlen = inst->header_size + 1; /* always at least one */
-   inst->dst.writemask = WRITEMASK_XYZW;
-   inst->shadow_compare = ir->shadow_comparitor != NULL;
+   glsl_type const *type = ir->sampler->type;
+   bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
+      type->sampler_array;
 
-   inst->src[1] = sampler_reg;
+   this->result = src_reg(this, ir->type);
+   dst_reg dest = dst_reg(this->result);
 
-   /* MRF for the first parameter */
-   int param_base = inst->base_mrf + inst->header_size;
-
-   if (ir->op == ir_txs || ir->op == ir_query_levels) {
-      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
-      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
-   } else {
-      /* Load the coordinate */
-      /* FINISHME: gl_clamp_mask and saturate */
-      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
-      int zero_mask = 0xf & ~coord_mask;
-
-      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
-               coordinate));
-
-      if (zero_mask != 0) {
-         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
-                  src_reg(0)));
-      }
-      /* Load the shadow comparitor */
-      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
-	 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
-			  WRITEMASK_X),
-		  shadow_comparitor));
-	 inst->mlen++;
-      }
-
-      /* Load the LOD info */
-      if (ir->op == ir_tex || ir->op == ir_txl) {
-	 int mrf, writemask;
-	 if (devinfo->gen >= 5) {
-	    mrf = param_base + 1;
-	    if (ir->shadow_comparitor) {
-	       writemask = WRITEMASK_Y;
-	       /* mlen already incremented */
-	    } else {
-	       writemask = WRITEMASK_X;
-	       inst->mlen++;
-	    }
-	 } else /* devinfo->gen == 4 */ {
-	    mrf = param_base;
-	    writemask = WRITEMASK_W;
-	 }
-	 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
-      } else if (ir->op == ir_txf) {
-         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
-      } else if (ir->op == ir_txf_ms) {
-         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
-                  sample_index));
-         if (devinfo->gen >= 7) {
-            /* MCS data is in the first channel of `mcs`, but we need to get it into
-             * the .y channel of the second vec4 of params, so replicate .x across
-             * the whole vec4 and then mask off everything except .y
-             */
-            mcs.swizzle = BRW_SWIZZLE_XXXX;
-            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
-                     mcs));
-         }
-         inst->mlen++;
-      } else if (ir->op == ir_txd) {
-	 const glsl_type *type = lod_type;
-
-	 if (devinfo->gen >= 5) {
-	    dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
-	    dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
-	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
-	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
-	    inst->mlen++;
-
-	    if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
-	       dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
-	       dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
-	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
-	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
-	       inst->mlen++;
-
-               if (ir->shadow_comparitor) {
-                  emit(MOV(dst_reg(MRF, param_base + 2,
-                                   ir->shadow_comparitor->type, WRITEMASK_Z),
-                           shadow_comparitor));
-               }
-	    }
-	 } else /* devinfo->gen == 4 */ {
-	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
-	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
-	    inst->mlen += 2;
-	 }
-      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
-         if (ir->shadow_comparitor) {
-            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
-                     shadow_comparitor));
-         }
-
-         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
-                  offset_value));
-         inst->mlen++;
-      }
-   }
-
-   emit(inst);
-
-   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
-    * spec requires layers.
-    */
-   if (ir->op == ir_txs) {
-      glsl_type const *type = ir->sampler->type;
-      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-          type->sampler_array) {
-         emit_math(SHADER_OPCODE_INT_QUOTIENT,
-                   writemask(inst->dst, WRITEMASK_Z),
-                   src_reg(inst->dst), src_reg(6));
-      }
-   }
-
-   if (devinfo->gen == 6 && ir->op == ir_tg4) {
-      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
-   }
-
-   swizzle_result(ir, src_reg(inst->dst), sampler);
+   emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
+                shadow_comparitor,
+                lod, lod2, sample_index,
+                constant_offset, offset_value,
+                mcs, is_cube_array, sampler, sampler_reg);
 }
 
 /**
@@ -2835,10 +2890,9 @@ vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
  * Set up the gather channel based on the swizzle, for gather4.
  */
 uint32_t
-vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
+vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
 {
-   ir_constant *chan = ir->lod_info.component->as_constant();
-   int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
+   int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
    switch (swiz) {
       case SWIZZLE_X: return 0;
       case SWIZZLE_Y:
@@ -2856,22 +2910,23 @@ vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
 }
 
 void
-vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
+vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
+                             src_reg orig_val, uint32_t sampler,
+                             const glsl_type *dest_type)
 {
    int s = key->tex.swizzles[sampler];
 
-   this->result = src_reg(this, ir->type);
-   dst_reg swizzled_result(this->result);
+   dst_reg swizzled_result = dest;
 
-   if (ir->op == ir_query_levels) {
+   if (op == ir_query_levels) {
       /* # levels is in .w */
       orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
       emit(MOV(swizzled_result, orig_val));
       return;
    }
 
-   if (ir->op == ir_txs || ir->type == glsl_type::float_type
-			|| s == SWIZZLE_NOOP || ir->op == ir_tg4) {
+   if (op == ir_txs || dest_type == glsl_type::float_type
+			|| s == SWIZZLE_NOOP || op == ir_tg4) {
       emit(MOV(swizzled_result, orig_val));
       return;
    }
@@ -2953,12 +3008,25 @@ vec4_visitor::visit(ir_if *ir)
    emit(BRW_OPCODE_ENDIF);
 }
 
+void
+vec4_visitor::gs_emit_vertex(int stream_id)
+{
+   unreachable("not reached");
+}
+
 void
 vec4_visitor::visit(ir_emit_vertex *)
 {
    unreachable("not reached");
 }
 
+void
+vec4_visitor::gs_end_primitive()
+{
+   unreachable("not reached");
+}
+
+
 void
 vec4_visitor::visit(ir_end_primitive *)
 {
@@ -3094,6 +3162,7 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
          vec4_instruction *inst;
          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
          inst->predicate = BRW_PREDICATE_NORMAL;
+         output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
          inst->predicate = BRW_PREDICATE_NORMAL;
       }
@@ -3106,18 +3175,23 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg)
       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
          dst_reg reg_w = reg;
          reg_w.writemask = WRITEMASK_W;
-         emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
+         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
+         reg_as_src.type = reg_w.type;
+         reg_as_src.swizzle = brw_swizzle_for_size(1);
+         emit(MOV(reg_w, reg_as_src));
       }
       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
          dst_reg reg_y = reg;
          reg_y.writemask = WRITEMASK_Y;
          reg_y.type = BRW_REGISTER_TYPE_D;
+         output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
       }
       if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
          dst_reg reg_z = reg;
          reg_z.writemask = WRITEMASK_Z;
          reg_z.type = BRW_REGISTER_TYPE_D;
+         output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
       }
    }
@@ -3155,8 +3229,8 @@ vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
 vec4_instruction *
 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
 {
-   assert (varying < VARYING_SLOT_MAX);
-   reg.type = output_reg[varying].type;
+   assert(varying < VARYING_SLOT_MAX);
+   assert(output_reg[varying].type == reg.type);
    current_annotation = output_reg_annotation[varying];
    /* Copy the register, saturating if necessary */
    return emit(MOV(reg, src_reg(output_reg[varying])));
@@ -3166,6 +3240,7 @@ void
 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
 {
    reg.type = BRW_REGISTER_TYPE_F;
+   output_reg[varying].type = reg.type;
 
    switch (varying) {
    case VARYING_SLOT_PSIZ:
@@ -3422,7 +3497,8 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
 				       inst->dst.writemask));
    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
-   write->predicate = inst->predicate;
+   if (inst->opcode != BRW_OPCODE_SEL)
+      write->predicate = inst->predicate;
    write->ir = inst->ir;
    write->annotation = inst->annotation;
    inst->insert_after(block, write);
@@ -3485,16 +3561,16 @@ vec4_visitor::move_grf_array_access_to_scratch()
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       if (inst->dst.file == GRF && inst->dst.reladdr) {
          if (scratch_loc[inst->dst.reg] == -1) {
-            scratch_loc[inst->dst.reg] = c->last_scratch;
-            c->last_scratch += this->alloc.sizes[inst->dst.reg];
+            scratch_loc[inst->dst.reg] = last_scratch;
+            last_scratch += this->alloc.sizes[inst->dst.reg];
          }
 
          for (src_reg *iter = inst->dst.reladdr;
               iter->reladdr;
               iter = iter->reladdr) {
             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = c->last_scratch;
-               c->last_scratch += this->alloc.sizes[iter->reg];
+               scratch_loc[iter->reg] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->reg];
             }
          }
       }
@@ -3504,8 +3580,8 @@ vec4_visitor::move_grf_array_access_to_scratch()
               iter->reladdr;
               iter = iter->reladdr) {
             if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = c->last_scratch;
-               c->last_scratch += this->alloc.sizes[iter->reg];
+               scratch_loc[iter->reg] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->reg];
             }
          }
       }
@@ -3679,7 +3755,7 @@ vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
 }
 
 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
-                           struct brw_vec4_compile *c,
+                           void *log_data,
                            struct gl_program *prog,
                            const struct brw_vue_prog_key *key,
                            struct brw_vue_prog_data *prog_data,
@@ -3688,9 +3764,8 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
 			   void *mem_ctx,
                            bool no_spills,
                            int shader_time_index)
-   : backend_shader(compiler, NULL, mem_ctx,
+   : backend_shader(compiler, log_data, mem_ctx,
                     shader_prog, prog, &prog_data->base, stage),
-     c(c),
      key(key),
      prog_data(prog_data),
      sanity_param_count(0),
@@ -3698,7 +3773,8 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
      first_non_payload_grf(0),
      need_all_constants_in_pull_buffer(false),
      no_spills(no_spills),
-     shader_time_index(shader_time_index)
+     shader_time_index(shader_time_index),
+     last_scratch(0)
 {
    this->failed = false;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
index dcbd2405078..d1a72d787e7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
@@ -394,8 +394,7 @@ vec4_vs_visitor::emit_program_code()
     * pull constants.  Do that now.
     */
    if (this->need_all_constants_in_pull_buffer) {
-      const struct gl_program_parameter_list *params =
-         vs_compile->vp->program.Base.Parameters;
+      const struct gl_program_parameter_list *params = vp->Base.Parameters;
       unsigned i;
       for (i = 0; i < params->NumParameters * 4; i++) {
          stage_prog_data->pull_param[i] =
@@ -415,8 +414,7 @@ vec4_vs_visitor::setup_vp_regs()
       vp_temp_regs[i] = src_reg(this, glsl_type::vec4_type);
 
    /* PROGRAM_STATE_VAR etc. */
-   struct gl_program_parameter_list *plist =
-      vs_compile->vp->program.Base.Parameters;
+   struct gl_program_parameter_list *plist = vp->Base.Parameters;
    for (unsigned p = 0; p < plist->NumParameters; p++) {
       unsigned components = plist->Parameters[p].Size;
 
@@ -486,8 +484,7 @@ vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
 src_reg
 vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
 {
-   struct gl_program_parameter_list *plist =
-      vs_compile->vp->program.Base.Parameters;
+   struct gl_program_parameter_list *plist = vp->Base.Parameters;
 
    src_reg result;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index f93062b46d0..620f652d6dc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -36,7 +36,7 @@ vec4_vs_visitor::emit_prolog()
 
    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
-         uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
+         uint8_t wa_flags = key->gl_attrib_wa_flags[i];
          dst_reg reg(ATTR, i);
          dst_reg reg_d = reg;
          reg_d.type = BRW_REGISTER_TYPE_D;
@@ -143,7 +143,8 @@ vec4_vs_visitor::emit_prolog()
 
 
 dst_reg *
-vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
+vec4_vs_visitor::make_reg_for_system_value(int location,
+                                           const glsl_type *type)
 {
    /* VertexID is stored by the VF as the last vertex element, but
     * we don't represent it with a flag in inputs_read, so we call
@@ -151,7 +152,7 @@ vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
     */
    dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
 
-   switch (ir->data.location) {
+   switch (location) {
    case SYSTEM_VALUE_BASE_VERTEX:
       reg->writemask = WRITEMASK_X;
       vs_prog_data->uses_vertexid = true;
@@ -212,19 +213,22 @@ vec4_vs_visitor::emit_thread_end()
 
 
 vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
-                                 struct brw_vs_compile *vs_compile,
+                                 void *log_data,
+                                 const struct brw_vs_prog_key *key,
                                  struct brw_vs_prog_data *vs_prog_data,
+                                 struct gl_vertex_program *vp,
                                  struct gl_shader_program *prog,
                                  void *mem_ctx,
                                  int shader_time_index,
                                  bool use_legacy_snorm_formula)
-   : vec4_visitor(compiler, &vs_compile->base, &vs_compile->vp->program.Base,
-                  &vs_compile->key.base, &vs_prog_data->base, prog,
+   : vec4_visitor(compiler, log_data,
+                  &vp->Base, &key->base, &vs_prog_data->base, prog,
                   MESA_SHADER_VERTEX,
                   mem_ctx, false /* no_spills */,
                   shader_time_index),
-     vs_compile(vs_compile),
+     key(key),
      vs_prog_data(vs_prog_data),
+     vp(vp),
      use_legacy_snorm_formula(use_legacy_snorm_formula)
 {
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 6e9848fb1e9..c53cb49b612 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -94,7 +94,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
 {
    GLuint program_size;
    const GLuint *program;
-   struct brw_vs_compile c;
    struct brw_vs_prog_data prog_data;
    struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base;
    void *mem_ctx;
@@ -104,8 +103,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
    if (prog)
       vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];
 
-   memset(&c, 0, sizeof(c));
-   memcpy(&c.key, key, sizeof(*key));
    memset(&prog_data, 0, sizeof(prog_data));
 
    /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
@@ -114,8 +111,6 @@ brw_codegen_vs_prog(struct brw_context *brw,
 
    mem_ctx = ralloc_context(NULL);
 
-   c.vp = vp;
-
    /* Allocate the references to the uniforms that will end up in the
     * prog_data associated with the compiled program, and which will be freed
     * by the state cache.
@@ -126,26 +121,30 @@ brw_codegen_vs_prog(struct brw_context *brw,
        * case being a float value that gets blown up to a vec4, so be
        * conservative here.
        */
-      param_count = vs->num_uniform_components * 4;
-
+      param_count = vs->num_uniform_components * 4 +
+                    vs->NumImages * BRW_IMAGE_PARAM_SIZE;
+      stage_prog_data->nr_image_params = vs->NumImages;
    } else {
       param_count = vp->program.Base.Parameters->NumParameters * 4;
    }
    /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
     * planes as uniforms.
     */
-   param_count += c.key.base.nr_userclip_plane_consts * 4;
+   param_count += key->base.nr_userclip_plane_consts * 4;
 
    stage_prog_data->param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    stage_prog_data->pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   stage_prog_data->image_param =
+      rzalloc_array(NULL, struct brw_image_param,
+                    stage_prog_data->nr_image_params);
    stage_prog_data->nr_params = param_count;
 
    GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
    prog_data.inputs_read = vp->program.Base.InputsRead;
 
-   if (c.key.copy_edgeflag) {
+   if (key->copy_edgeflag) {
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
       prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
    }
@@ -158,7 +157,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
        * coords, which would be a pain to handle.
        */
       for (i = 0; i < 8; i++) {
-         if (c.key.point_coord_replace & (1 << i))
+         if (key->point_coord_replace & (1 << i))
             outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
       }
 
@@ -173,7 +172,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
     * distance varying slots whenever clipping is enabled, even if the vertex
     * shader doesn't write to gl_ClipDistance.
     */
-   if (c.key.base.userclip_active) {
+   if (key->base.userclip_active) {
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
    }
@@ -182,34 +181,28 @@ brw_codegen_vs_prog(struct brw_context *brw,
                        &prog_data.base.vue_map, outputs_written);
 
    if (0) {
-      _mesa_fprint_program_opt(stderr, &c.vp->program.Base, PROG_PRINT_DEBUG,
+      _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
 			       true);
    }
 
    /* Emit GEN4 code.
     */
-   program = brw_vs_emit(brw, prog, &c, &prog_data, mem_ctx, &program_size);
+   program = brw_vs_emit(brw, mem_ctx, key, &prog_data,
+                         &vp->program, prog, &program_size);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
    }
 
    /* Scratch space is used for register spilling */
-   if (c.base.last_scratch) {
-      perf_debug("Vertex shader triggered register spilling.  "
-                 "Try reducing the number of live vec4 values to "
-                 "improve performance.\n");
-
-      prog_data.base.base.total_scratch
-         = brw_get_scratch_size(c.base.last_scratch*REG_SIZE);
-
+   if (prog_data.base.base.total_scratch) {
       brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo,
 			 prog_data.base.base.total_scratch *
                          brw->max_vs_threads);
    }
 
    brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG,
-		    &c.key, sizeof(c.key),
+		    key, sizeof(struct brw_vs_prog_key),
 		    program, program_size,
 		    &prog_data, sizeof(prog_data),
 		    &brw->vs.base.prog_offset, &brw->vs.prog_data);
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 61f9b006a58..1d9bee11c56 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -50,22 +50,16 @@
 #define BRW_ATTRIB_WA_SIGN          32  /* interpret as signed in shader */
 #define BRW_ATTRIB_WA_SCALE         64  /* interpret as scaled in shader */
 
-struct brw_vs_compile {
-   struct brw_vec4_compile base;
-   struct brw_vs_prog_key key;
-
-   struct brw_vertex_program *vp;
-};
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 const unsigned *brw_vs_emit(struct brw_context *brw,
-                            struct gl_shader_program *prog,
-                            struct brw_vs_compile *c,
-                            struct brw_vs_prog_data *prog_data,
                             void *mem_ctx,
+                            const struct brw_vs_prog_key *key,
+                            struct brw_vs_prog_data *prog_data,
+                            struct gl_vertex_program *vp,
+                            struct gl_shader_program *shader_prog,
                             unsigned *program_size);
 void brw_vs_debug_recompile(struct brw_context *brw,
                             struct gl_shader_program *prog,
@@ -91,15 +85,18 @@ class vec4_vs_visitor : public vec4_visitor
 {
 public:
    vec4_vs_visitor(const struct brw_compiler *compiler,
-                   struct brw_vs_compile *vs_compile,
+                   void *log_data,
+                   const struct brw_vs_prog_key *key,
                    struct brw_vs_prog_data *vs_prog_data,
+                   struct gl_vertex_program *vp,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
                    int shader_time_index,
                    bool use_legacy_snorm_formula);
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void setup_payload();
    virtual void emit_prolog();
    virtual void emit_program_code();
@@ -113,8 +110,9 @@ private:
    dst_reg get_vp_dst_reg(const prog_dst_register &dst);
    src_reg get_vp_src_reg(const prog_src_register &src);
 
-   struct brw_vs_compile * const vs_compile;
+   const struct brw_vs_prog_key *const key;
    struct brw_vs_prog_data * const vs_prog_data;
+   struct gl_vertex_program *const vp;
    src_reg *vp_temp_regs;
    src_reg vp_addr_reg;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index b2f91bd412b..72e37d4b467 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -191,3 +191,28 @@ const struct brw_tracked_state brw_vs_abo_surfaces = {
    },
    .emit = brw_upload_vs_abo_surfaces,
 };
+
+static void
+brw_upload_vs_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_VERTEX_PROGRAM */
+   struct gl_shader_program *prog =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
+
+   if (prog) {
+      /* BRW_NEW_VS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX],
+                                &brw->vs.base, &brw->vs.prog_data->base.base);
+   }
+}
+
+const struct brw_tracked_state brw_vs_image_surfaces = {
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_IMAGE_UNITS |
+             BRW_NEW_VERTEX_PROGRAM |
+             BRW_NEW_VS_PROG_DATA,
+   },
+   .emit = brw_upload_vs_image_surfaces,
+};
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 4619ce1080d..41266f57560 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -1,34 +1,28 @@
 /*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keithw@vmware.com>
-  */
-
+ * Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ * Intel funded Tungsten Graphics to
+ * develop this 3D driver.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
 #include "brw_context.h"
 #include "brw_wm.h"
 #include "brw_state.h"
@@ -181,9 +175,12 @@ brw_codegen_wm_prog(struct brw_context *brw,
     * so the shader definitely kills pixels.
     */
    prog_data.uses_kill = fp->program.UsesKill || key->alpha_test_func;
-
+   prog_data.uses_omask =
+      fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    prog_data.computed_depth_mode = computed_depth_mode(&fp->program);
 
+   prog_data.early_fragment_tests = fs && fs->EarlyFragmentTests;
+
    /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
    if (!prog)
       prog_data.base.use_alt_mode = true;
@@ -194,7 +191,9 @@ brw_codegen_wm_prog(struct brw_context *brw,
     */
    int param_count;
    if (fs) {
-      param_count = fs->num_uniform_components;
+      param_count = fs->num_uniform_components +
+                    fs->NumImages * BRW_IMAGE_PARAM_SIZE;
+      prog_data.base.nr_image_params = fs->NumImages;
    } else {
       param_count = fp->program.Base.Parameters->NumParameters * 4;
    }
@@ -204,6 +203,9 @@ brw_codegen_wm_prog(struct brw_context *brw,
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    prog_data.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param,
+                    prog_data.base.nr_image_params);
    prog_data.base.nr_params = param_count;
 
    prog_data.barycentric_interp_modes =
@@ -349,13 +351,15 @@ static uint8_t
 gen6_gather_workaround(GLenum internalformat)
 {
    switch (internalformat) {
-      case GL_R8I: return WA_SIGN | WA_8BIT;
-      case GL_R8UI: return WA_8BIT;
-      case GL_R16I: return WA_SIGN | WA_16BIT;
-      case GL_R16UI: return WA_16BIT;
-      /* note that even though GL_R32I and GL_R32UI have format overrides
-       * in the surface state, there is no shader w/a required */
-      default: return 0;
+   case GL_R8I: return WA_SIGN | WA_8BIT;
+   case GL_R8UI: return WA_8BIT;
+   case GL_R16I: return WA_SIGN | WA_16BIT;
+   case GL_R16UI: return WA_16BIT;
+   default:
+      /* Note that even though GL_R32I and GL_R32UI have format overrides in
+       * the surface state, there is no shader w/a required.
+       */
+      return 0;
    }
 }
 
@@ -402,8 +406,9 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx,
 	       key->gl_clamp_mask[2] |= 1 << s;
 	 }
 
-         /* gather4's channel select for green from RG32F is broken;
-          * requires a shader w/a on IVB; fixable with just SCS on HSW. */
+         /* gather4's channel select for green from RG32F is broken; requires
+          * a shader w/a on IVB; fixable with just SCS on HSW.
+          */
          if (brw->gen == 7 && !brw->is_haswell && prog->UsesGather) {
             if (img->InternalFormat == GL_RG32F)
                key->gather_channel_quirk_mask |= 1 << s;
@@ -452,13 +457,13 @@ brw_wm_state_dirty (struct brw_context *brw)
                           BRW_NEW_VUE_MAP_GEOM_OUT);
 }
 
-static void brw_wm_populate_key( struct brw_context *brw,
-				 struct brw_wm_prog_key *key )
+static void
+brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key)
 {
    struct gl_context *ctx = &brw->ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp =
-      (struct brw_fragment_program *)brw->fragment_program;
+      (struct brw_fragment_program *) brw->fragment_program;
    const struct gl_program *prog = (struct gl_program *) brw->fragment_program;
    GLuint lookup = 0;
    GLuint line_aa;
@@ -604,7 +609,8 @@ static void brw_wm_populate_key( struct brw_context *brw,
     * like GL requires.  Fix that by building the alpha test into the
     * shader, and we'll skip enabling the fixed function alpha test.
     */
-   if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 && ctx->Color.AlphaEnabled) {
+   if (brw->gen < 6 && ctx->DrawBuffer->_NumColorDrawBuffers > 1 &&
+       ctx->Color.AlphaEnabled) {
       key->alpha_test_func = ctx->Color.AlphaFunc;
       key->alpha_test_ref = ctx->Color.AlphaRef;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 72aad96bb6a..f13a97ce2b0 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1024,6 +1024,257 @@ const struct brw_tracked_state brw_cs_abo_surfaces = {
    .emit = brw_upload_cs_abo_surfaces,
 };
 
+static void
+brw_upload_cs_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* _NEW_PROGRAM */
+   struct gl_shader_program *prog =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
+
+   if (prog) {
+      /* BRW_NEW_CS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE],
+                                &brw->cs.base, &brw->cs.prog_data->base);
+   }
+}
+
+const struct brw_tracked_state brw_cs_image_surfaces = {
+   .dirty = {
+      .mesa = _NEW_PROGRAM,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_CS_PROG_DATA |
+             BRW_NEW_IMAGE_UNITS
+   },
+   .emit = brw_upload_cs_image_surfaces,
+};
+
+static uint32_t
+get_image_format(struct brw_context *brw, mesa_format format, GLenum access)
+{
+   if (access == GL_WRITE_ONLY) {
+      return brw_format_for_mesa_format(format);
+   } else {
+      /* Typed surface reads support a very limited subset of the shader
+       * image formats.  Translate the requested format into the closest
+       * format the hardware supports.
+       */
+      if ((_mesa_get_format_bytes(format) >= 16 && brw->gen <= 8) ||
+          (_mesa_get_format_bytes(format) >= 8 &&
+           (brw->gen == 7 && !brw->is_haswell)))
+         return BRW_SURFACEFORMAT_RAW;
+      else
+         return brw_format_for_mesa_format(
+            brw_lower_mesa_image_format(brw->intelScreen->devinfo, format));
+   }
+}
+
+static void
+update_default_image_param(struct brw_context *brw,
+                           struct gl_image_unit *u,
+                           unsigned surface_idx,
+                           struct brw_image_param *param)
+{
+   memset(param, 0, sizeof(*param));
+   param->surface_idx = surface_idx;
+   /* Set the swizzling shifts to all-ones to effectively disable swizzling --
+    * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
+    * detailed explanation of these parameters.
+    */
+   param->swizzling[0] = 0xff;
+   param->swizzling[1] = 0xff;
+}
+
+static void
+update_buffer_image_param(struct brw_context *brw,
+                          struct gl_image_unit *u,
+                          unsigned surface_idx,
+                          struct brw_image_param *param)
+{
+   struct gl_buffer_object *obj = u->TexObj->BufferObject;
+
+   update_default_image_param(brw, u, surface_idx, param);
+
+   param->size[0] = obj->Size / _mesa_get_format_bytes(u->_ActualFormat);
+   param->stride[0] = _mesa_get_format_bytes(u->_ActualFormat);
+}
+
+static void
+update_texture_image_param(struct brw_context *brw,
+                           struct gl_image_unit *u,
+                           unsigned surface_idx,
+                           struct brw_image_param *param)
+{
+   struct intel_mipmap_tree *mt = intel_texture_object(u->TexObj)->mt;
+
+   update_default_image_param(brw, u, surface_idx, param);
+
+   param->size[0] = minify(mt->logical_width0, u->Level);
+   param->size[1] = minify(mt->logical_height0, u->Level);
+   param->size[2] = (!u->Layered ? 1 :
+                     u->TexObj->Target == GL_TEXTURE_CUBE_MAP ? 6 :
+                     u->TexObj->Target == GL_TEXTURE_3D ?
+                     minify(mt->logical_depth0, u->Level) :
+                     mt->logical_depth0);
+
+   intel_miptree_get_image_offset(mt, u->Level, u->Layer,
+                                  &param->offset[0],
+                                  &param->offset[1]);
+
+   param->stride[0] = mt->cpp;
+   param->stride[1] = mt->pitch / mt->cpp;
+   param->stride[2] =
+      brw_miptree_get_horizontal_slice_pitch(brw, mt, u->Level);
+   param->stride[3] =
+      brw_miptree_get_vertical_slice_pitch(brw, mt, u->Level);
+
+   if (mt->tiling == I915_TILING_X) {
+      /* An X tile is a rectangular block of 512x8 bytes. */
+      param->tiling[0] = _mesa_logbase2(512 / mt->cpp);
+      param->tiling[1] = _mesa_logbase2(8);
+
+      if (brw->has_swizzling) {
+         /* Right shifts required to swizzle bits 9 and 10 of the memory
+          * address with bit 6.
+          */
+         param->swizzling[0] = 3;
+         param->swizzling[1] = 4;
+      }
+   } else if (mt->tiling == I915_TILING_Y) {
+      /* The layout of a Y-tiled surface in memory isn't fundamentally
+       * different from the layout of an X-tiled surface: we simply pretend
+       * that the surface is broken up into a number of smaller 16Bx32 tiles,
+       * each one arranged in X-major order, just as is the case for X-tiling.
+       */
+      param->tiling[0] = _mesa_logbase2(16 / mt->cpp);
+      param->tiling[1] = _mesa_logbase2(32);
+
+      if (brw->has_swizzling) {
+         /* Right shift required to swizzle bit 9 of the memory address with
+          * bit 6.
+          */
+         param->swizzling[0] = 3;
+      }
+   }
+
+   /* 3D textures are arranged in 2D in memory with 2^lod slices per row.  The
+    * address calculation algorithm (emit_address_calculation() in
+    * brw_fs_surface_builder.cpp) handles this as a sort of tiling with
+    * modulus equal to 2^lod, stored here as its base-2 logarithm (the LOD).
+    */
+   param->tiling[2] = (u->TexObj->Target == GL_TEXTURE_3D ? u->Level :
+                       0);
+}
+
+static void
+update_image_surface(struct brw_context *brw,
+                     struct gl_image_unit *u,
+                     GLenum access,
+                     unsigned surface_idx,
+                     uint32_t *surf_offset,
+                     struct brw_image_param *param)
+{
+   if (u->_Valid) {
+      struct gl_texture_object *obj = u->TexObj;
+      const unsigned format = get_image_format(brw, u->_ActualFormat, access);
+
+      if (obj->Target == GL_TEXTURE_BUFFER) {
+         struct intel_buffer_object *intel_obj =
+            intel_buffer_object(obj->BufferObject);
+         const unsigned texel_size = (format == BRW_SURFACEFORMAT_RAW ? 1 :
+                                      _mesa_get_format_bytes(u->_ActualFormat));
+
+         brw->vtbl.emit_buffer_surface_state(
+            brw, surf_offset, intel_obj->buffer, obj->BufferOffset,
+            format, intel_obj->Base.Size / texel_size, texel_size,
+            access != GL_READ_ONLY);
+
+         update_buffer_image_param(brw, u, surface_idx, param);
+
+      } else {
+         struct intel_texture_object *intel_obj = intel_texture_object(obj);
+         struct intel_mipmap_tree *mt = intel_obj->mt;
+
+         if (format == BRW_SURFACEFORMAT_RAW) {
+            brw->vtbl.emit_buffer_surface_state(
+               brw, surf_offset, mt->bo, mt->offset,
+               format, mt->bo->size - mt->offset, 1 /* pitch */,
+               access != GL_READ_ONLY);
+
+         } else {
+            const unsigned min_layer = obj->MinLayer + u->Layer;
+            const unsigned min_level = obj->MinLevel + u->Level;
+            const unsigned num_layers = (!u->Layered ? 1 :
+                                         obj->Target == GL_TEXTURE_CUBE_MAP ? 6 :
+                                         mt->logical_depth0);
+            const GLenum target = (obj->Target == GL_TEXTURE_CUBE_MAP ||
+                                   obj->Target == GL_TEXTURE_CUBE_MAP_ARRAY ?
+                                   GL_TEXTURE_2D_ARRAY : obj->Target);
+
+            brw->vtbl.emit_texture_surface_state(
+               brw, mt, target,
+               min_layer, min_layer + num_layers,
+               min_level, min_level + 1,
+               format, SWIZZLE_XYZW,
+               surf_offset, access != GL_READ_ONLY, false);
+         }
+
+         update_texture_image_param(brw, u, surface_idx, param);
+      }
+
+   } else {
+      brw->vtbl.emit_null_surface_state(brw, 1, 1, 1, surf_offset);
+      update_default_image_param(brw, u, surface_idx, param);
+   }
+}
+
+void
+brw_upload_image_surfaces(struct brw_context *brw,
+                          struct gl_shader *shader,
+                          struct brw_stage_state *stage_state,
+                          struct brw_stage_prog_data *prog_data)
+{
+   struct gl_context *ctx = &brw->ctx;
+
+   if (shader && shader->NumImages) {
+      for (unsigned i = 0; i < shader->NumImages; i++) {
+         struct gl_image_unit *u = &ctx->ImageUnits[shader->ImageUnits[i]];
+         const unsigned surf_idx = prog_data->binding_table.image_start + i;
+
+         update_image_surface(brw, u, shader->ImageAccess[i],
+                              surf_idx,
+                              &stage_state->surf_offset[surf_idx],
+                              &prog_data->image_param[i]);
+      }
+
+      brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
+   }
+}
+
+static void
+brw_upload_wm_image_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   struct gl_shader_program *prog = ctx->Shader._CurrentFragmentProgram;
+
+   if (prog) {
+      /* BRW_NEW_FS_PROG_DATA, BRW_NEW_IMAGE_UNITS */
+      brw_upload_image_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_FRAGMENT],
+                                &brw->wm.base, &brw->wm.prog_data->base);
+   }
+}
+
+const struct brw_tracked_state brw_wm_image_surfaces = {
+   .dirty = {
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_FRAGMENT_PROGRAM |
+             BRW_NEW_FS_PROG_DATA |
+             BRW_NEW_IMAGE_UNITS
+   },
+   .emit = brw_upload_wm_image_surfaces,
+};
+
 void
 gen4_init_vtable_surface_functions(struct brw_context *brw)
 {
diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
index b6a3d78d849..54c4a6dfdd8 100644
--- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
@@ -821,7 +821,7 @@ gen6_blorp_emit_depth_stencil_config(struct brw_context *brw,
 
    /* 3DSTATE_DEPTH_BUFFER */
    {
-      intel_emit_depth_stall_flushes(brw);
+      brw_emit_depth_stall_flushes(brw);
 
       BEGIN_BATCH(7);
       /* 3DSTATE_DEPTH_BUFFER dw0 */
@@ -896,7 +896,7 @@ static void
 gen6_blorp_emit_depth_disable(struct brw_context *brw,
                               const brw_blorp_params *params)
 {
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    BEGIN_BATCH(7);
    OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
@@ -1021,7 +1021,7 @@ gen6_blorp_exec(struct brw_context *brw,
    uint32_t prog_offset = params->get_wm_prog(brw, &prog_data);
 
    /* Emit workaround flushes when we switch from drawing to blorping. */
-   intel_emit_post_sync_nonzero_flush(brw);
+   brw_emit_post_sync_nonzero_flush(brw);
 
    gen6_emit_3dstate_multisample(brw, params->dst.num_samples);
    gen6_emit_3dstate_sample_mask(brw,
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 2bfa271b527..3bab8f46ae8 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -51,7 +51,7 @@ gen6_upload_blend_state(struct brw_context *brw)
     * with render target 0, which will reference BLEND_STATE[0] for
     * alpha test enable.
     */
-   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
+   if (nr_draw_buffers == 0)
       nr_draw_buffers = 1;
 
    size = sizeof(*blend) * nr_draw_buffers;
@@ -97,8 +97,8 @@ gen6_upload_blend_state(struct brw_context *brw)
                    rb_type != GL_UNSIGNED_NORMALIZED &&
                    rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
                    "renderbuffer\n",
-                   _mesa_lookup_enum_by_nr(ctx->Color.LogicOp),
-                   _mesa_lookup_enum_by_nr(rb_type));
+                   _mesa_enum_to_string(ctx->Color.LogicOp),
+                   _mesa_enum_to_string(rb_type));
 	 if (rb_type == GL_UNSIGNED_NORMALIZED) {
 	    blend[b].blend1.logic_op_enable = 1;
 	    blend[b].blend1.logic_op_func =
diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c
index 1df0bd47571..febd4781100 100644
--- a/src/mesa/drivers/dri/i965/gen6_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_depth_state.c
@@ -65,7 +65,7 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
     */
    bool enable_hiz_ss = hiz || separate_stencil;
 
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
    if (!irb)
@@ -73,7 +73,7 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
    rb = (struct gl_renderbuffer*) irb;
 
    if (rb) {
-      depth = MAX2(rb->Depth, 1);
+      depth = MAX2(irb->layer_count, 1);
       if (rb->TexImage)
          gl_target = rb->TexImage->TexObject->Target;
    }
@@ -89,6 +89,10 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
       surftype = BRW_SURFACE_2D;
       depth *= 6;
       break;
+   case GL_TEXTURE_3D:
+      assert(mt);
+      depth = MAX2(mt->logical_depth0, 1);
+      /* fallthrough */
    default:
       surftype = translate_tex_target(gl_target);
       break;
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index 782687aac57..68e443d38a5 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -147,7 +147,12 @@ gen6_gs_visitor::emit_prolog()
 }
 
 void
-gen6_gs_visitor::visit(ir_emit_vertex *)
+gen6_gs_visitor::visit(ir_emit_vertex *ir)
+{
+   gs_emit_vertex(ir->stream_id());
+}
+void
+gen6_gs_visitor::gs_emit_vertex(int stream_id)
 {
    this->current_annotation = "gen6 emit vertex";
    /* Honor max_vertex layout indication in geometry shader by ignoring any
@@ -223,6 +228,12 @@ gen6_gs_visitor::visit(ir_emit_vertex *)
 
 void
 gen6_gs_visitor::visit(ir_end_primitive *)
+{
+   gs_end_primitive();
+}
+
+void
+gen6_gs_visitor::gs_end_primitive()
 {
    this->current_annotation = "gen6 end primitive";
    /* Calling EndPrimitive() is optional for point output. In this case we set
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index 27254ebb727..4cf94893261 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -36,12 +36,14 @@ class gen6_gs_visitor : public vec4_gs_visitor
 {
 public:
    gen6_gs_visitor(const struct brw_compiler *comp,
+                   void *log_data,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
                    bool no_spills,
                    int shader_time_index) :
-      vec4_gs_visitor(comp, c, prog, mem_ctx, no_spills, shader_time_index) {}
+      vec4_gs_visitor(comp, log_data, c, prog, mem_ctx, no_spills,
+                      shader_time_index) {}
 
 protected:
    virtual void assign_binding_table_offsets();
@@ -49,6 +51,8 @@ protected:
    virtual void emit_thread_end();
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
    virtual void emit_urb_write_header(int mrf);
    virtual void emit_urb_write_opcode(bool complete,
                                       int base_mrf,
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index 36734f598fe..8444c0c9bae 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -143,12 +143,11 @@ gen6_emit_3dstate_multisample(struct brw_context *brw,
    ADVANCE_BATCH();
 }
 
-
 unsigned
 gen6_determine_sample_mask(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   float coverage = 1.0;
+   float coverage = 1.0f;
    float coverage_invert = false;
    unsigned sample_mask = ~0u;
 
@@ -166,7 +165,7 @@ gen6_determine_sample_mask(struct brw_context *brw)
    }
 
    if (num_samples > 1) {
-      int coverage_int = (int) (num_samples * coverage + 0.5);
+      int coverage_int = (int) (num_samples * coverage + 0.5f);
       uint32_t coverage_bits = (1 << coverage_int) - 1;
       if (coverage_invert)
          coverage_bits ^= (1 << num_samples) - 1;
@@ -176,7 +175,6 @@ gen6_determine_sample_mask(struct brw_context *brw)
    }
 }
 
-
 /**
  * 3DSTATE_SAMPLE_MASK
  */
@@ -189,15 +187,14 @@ gen6_emit_3dstate_sample_mask(struct brw_context *brw, unsigned mask)
    ADVANCE_BATCH();
 }
 
-
-static void upload_multisample_state(struct brw_context *brw)
+static void
+upload_multisample_state(struct brw_context *brw)
 {
    /* BRW_NEW_NUM_SAMPLES */
    gen6_emit_3dstate_multisample(brw, brw->num_samples);
    gen6_emit_3dstate_sample_mask(brw, gen6_determine_sample_mask(brw));
 }
 
-
 const struct brw_tracked_state gen6_multisample_state = {
    .dirty = {
       .mesa = _NEW_MULTISAMPLE,
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index ba5c944fb3d..9f4a5db3592 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -86,7 +86,7 @@ static void
 write_primitives_generated(struct brw_context *brw,
                            drm_intel_bo *query_bo, int stream, int idx)
 {
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (brw->gen >= 7 && stream > 0) {
       brw_store_register_mem64(brw, query_bo,
@@ -100,7 +100,7 @@ static void
 write_xfb_primitives_written(struct brw_context *brw,
                              drm_intel_bo *bo, int stream, int idx)
 {
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    if (brw->gen >= 7) {
       brw_store_register_mem64(brw, bo, GEN7_SO_NUM_PRIMS_WRITTEN(stream), idx);
@@ -157,7 +157,7 @@ emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo,
    /* Emit a flush to make sure various parts of the pipeline are complete and
     * we get an accurate value
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    brw_store_register_mem64(brw, bo, reg, idx);
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index b00517ed81e..4068f2844a2 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -383,7 +383,7 @@ upload_sf_state(struct brw_context *brw)
    point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
 
    /* Clamp to the hardware limits and convert to fixed point */
-   dw4 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3);
+   dw4 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
 
    /*
     * Window coordinates in an FBO are inverted, which means point
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index be80d7bdfc5..3899ce9451f 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -292,5 +292,5 @@ brw_end_transform_feedback(struct gl_context *ctx,
     * simplicity, just do a full flush.
     */
    struct brw_context *brw = brw_context(ctx);
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_surface_state.c b/src/mesa/drivers/dri/i965/gen6_surface_state.c
index 03e913a0a76..39de62f2304 100644
--- a/src/mesa/drivers/dri/i965/gen6_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_surface_state.c
@@ -88,7 +88,8 @@ gen6_update_renderbuffer_surface(struct brw_context *brw,
       break;
    }
 
-   const int min_array_element = layered ? 0 : irb->mt_layer;
+   const int min_array_element = irb->mt_layer;
+   assert(!layered || irb->mt_layer == 0);
 
    surf[0] = SET_FIELD(surftype, BRW_SURFACE_TYPE) |
              SET_FIELD(format, BRW_SURFACE_FORMAT);
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index 107a4f24fa6..c7311fd0b03 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -120,7 +120,7 @@ gen6_upload_urb( struct brw_context *brw )
     * a workaround.
     */
    if (brw->urb.gs_present && !gs_present)
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
    brw->urb.gs_present = gs_present;
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 7c8d8849f4e..11b9a360ced 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -101,7 +101,7 @@ gen6_upload_sf_vp(struct brw_context *brw)
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
 
       /* _NEW_VIEWPORT */
       _mesa_get_viewport_xform(ctx, i, scale, translate);
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 2bdc82bc895..9822dc1fe79 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -645,7 +645,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
 
    /* 3DSTATE_DEPTH_BUFFER */
    {
-      intel_emit_depth_stall_flushes(brw);
+      brw_emit_depth_stall_flushes(brw);
 
       BEGIN_BATCH(7);
       OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
@@ -696,7 +696,7 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
 static void
 gen7_blorp_emit_depth_disable(struct brw_context *brw)
 {
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    BEGIN_BATCH(7);
    OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
@@ -794,6 +794,8 @@ gen7_blorp_exec(struct brw_context *brw,
    }
    depthstencil_offset = gen6_blorp_emit_depth_stencil_state(brw, params);
    gen7_blorp_emit_depth_stencil_state_pointers(brw, depthstencil_offset);
+   if (brw->use_resource_streamer)
+      gen7_disable_hw_binding_tables(brw);
    if (params->use_wm_prog) {
       uint32_t wm_surf_offset_renderbuffer;
       uint32_t wm_surf_offset_texture = 0;
diff --git a/src/mesa/drivers/dri/i965/gen7_disable.c b/src/mesa/drivers/dri/i965/gen7_disable.c
index 2c43cd77f07..bb509696d72 100644
--- a/src/mesa/drivers/dri/i965/gen7_disable.c
+++ b/src/mesa/drivers/dri/i965/gen7_disable.c
@@ -52,7 +52,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_HS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 
    /* Disable the TE */
@@ -85,7 +85,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_DS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index 8d6d3fe1d34..497ecec8e45 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -59,7 +59,9 @@ upload_gs_state(struct brw_context *brw)
       OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+                (brw->is_haswell && prog_data->base.nr_image_params ?
+                 HSW_GS_UAV_ACCESS_ENABLE : 0));
 
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
index f4f665219d6..a14d4a0c50d 100644
--- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c
@@ -57,7 +57,7 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw,
       return;
    }
 
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
    if (!irb)
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 4fa46a8eb97..698b3d491bc 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -220,7 +220,7 @@ upload_sf_state(struct brw_context *brw)
    point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
 
    /* Clamp to the hardware limits and convert to fixed point */
-   dw3 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3);
+   dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
 
    /* _NEW_LIGHT */
    if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index aec4f44bb73..41573a80a52 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -365,7 +365,7 @@ gen7_save_primitives_written_counters(struct brw_context *brw,
    }
 
    /* Flush any drawing so that the counters have the right values. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
    for (int i = 0; i < streams; i++) {
@@ -502,7 +502,7 @@ gen7_pause_transform_feedback(struct gl_context *ctx,
       (struct brw_transform_feedback_object *) obj;
 
    /* Flush any drawing so that the counters have the right values. */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Save the SOL buffer offset register values. */
    if (brw->gen < 8) {
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index d371c193577..69162171c4e 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -228,7 +228,7 @@ gen7_upload_urb(struct brw_context *brw)
       remaining_space = total_wants;
    if (remaining_space > 0) {
       unsigned vs_additional = (unsigned)
-         round(vs_wants * (((double) remaining_space) / total_wants));
+         roundf(vs_wants * (((float) remaining_space) / total_wants));
       vs_chunks += vs_additional;
       remaining_space -= vs_additional;
       gs_chunks += remaining_space;
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
index b655205ec35..c75dc9964bf 100644
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
@@ -53,7 +53,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
       /* According to the "Vertex X,Y Clamping and Quantization" section of
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 4b17d06fa83..b7e48585482 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -62,6 +62,7 @@ gen7_upload_constant_state(struct brw_context *brw,
       OUT_BATCH(active ? stage_state->push_const_size : 0);
       OUT_BATCH(0);
    }
+
    /* Pointer to the constant buffer.  Covered by the set of state flags
     * from gen6_prepare_wm_contants
     */
@@ -95,15 +96,14 @@ gen7_upload_constant_state(struct brw_context *brw,
 
    ADVANCE_BATCH();
 
-  /* On SKL+ the new constants don't take effect until the next corresponding
-   * 3DSTATE_BINDING_TABLE_POINTER_* command is parsed so we need to ensure
-   * that is sent
-   */
+   /* On SKL+ the new constants don't take effect until the next corresponding
+    * 3DSTATE_BINDING_TABLE_POINTER_* command is parsed so we need to ensure
+    * that one is sent.
+    */
    if (brw->gen >= 9)
       brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
 }
 
-
 static void
 upload_vs_state(struct brw_context *brw)
 {
@@ -111,6 +111,7 @@ upload_vs_state(struct brw_context *brw)
    uint32_t floating_point_mode = 0;
    const int max_threads_shift = brw->is_haswell ?
       HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
+   const struct brw_vue_prog_data *prog_data = &brw->vs.prog_data->base;
 
    if (!brw->is_haswell && !brw->is_baytrail)
       gen7_emit_vs_workaround_flush(brw);
@@ -125,19 +126,21 @@ upload_vs_state(struct brw_context *brw)
 	     ((ALIGN(stage_state->sampler_count, 4)/4) <<
               GEN6_VS_SAMPLER_COUNT_SHIFT) |
              ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) <<
-              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+             (brw->is_haswell && prog_data->base.nr_image_params ?
+              HSW_VS_UAV_ACCESS_ENABLE : 0));
 
-   if (brw->vs.prog_data->base.base.total_scratch) {
+   if (prog_data->base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
 		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		ffs(brw->vs.prog_data->base.base.total_scratch) - 11);
+		ffs(prog_data->base.total_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
 
-   OUT_BATCH((brw->vs.prog_data->base.base.dispatch_grf_start_reg <<
+   OUT_BATCH((prog_data->base.dispatch_grf_start_reg <<
               GEN6_VS_DISPATCH_START_GRF_SHIFT) |
-	     (brw->vs.prog_data->base.urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
+	     (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
 	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
 
    OUT_BATCH(((brw->max_vs_threads - 1) << max_threads_shift) |
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index ea11ae845e3..fd6dab5be8b 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -83,6 +83,7 @@ upload_wm_state(struct brw_context *brw)
 
    /* _NEW_BUFFERS | _NEW_COLOR */
    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
+       prog_data->base.nr_image_params ||
        dw1 & GEN7_WM_KILL_ENABLE) {
       dw1 |= GEN7_WM_DISPATCH_ENABLE;
    }
@@ -106,6 +107,18 @@ upload_wm_state(struct brw_context *brw)
       dw1 |= GEN7_WM_USES_INPUT_COVERAGE_MASK;
    }
 
+   /* BRW_NEW_FS_PROG_DATA */
+   if (prog_data->early_fragment_tests)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
+   else if (prog_data->base.nr_image_params)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
+
+   /* _NEW_BUFFERS | _NEW_COLOR */
+   if (brw->is_haswell &&
+       !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
+       prog_data->base.nr_image_params)
+      dw2 |= HSW_WM_UAV_ONLY;
+
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
    OUT_BATCH(dw1);
@@ -127,7 +140,7 @@ const struct brw_tracked_state gen7_wm_state = {
    .emit = upload_wm_state,
 };
 
-void
+static void
 gen7_upload_ps_state(struct brw_context *brw,
                      const struct gl_fragment_program *fp,
                      const struct brw_stage_state *stage_state,
@@ -208,6 +221,9 @@ gen7_upload_ps_state(struct brw_context *brw,
       _mesa_get_min_invocations_per_fragment(ctx, fp, false);
    assert(min_inv_per_frag >= 1);
 
+   if (brw->is_haswell && prog_data->base.nr_image_params)
+      dw4 |= HSW_PS_UAV_ACCESS_ENABLE;
+
    if (prog_data->prog_offset_16 || prog_data->no_8) {
       dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
       if (!prog_data->no_8 && min_inv_per_frag == 1) {
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 12ac97a5d14..93100a0708f 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -41,7 +41,6 @@ emit_depth_packets(struct brw_context *brw,
                    bool depth_writable,
                    struct intel_mipmap_tree *stencil_mt,
                    bool stencil_writable,
-                   uint32_t stencil_offset,
                    bool hiz,
                    uint32_t width,
                    uint32_t height,
@@ -57,7 +56,7 @@ emit_depth_packets(struct brw_context *brw,
       return;
    }
 
-   intel_emit_depth_stall_flushes(brw);
+   brw_emit_depth_stall_flushes(brw);
 
    /* _NEW_BUFFERS, _NEW_DEPTH, _NEW_STENCIL */
    BEGIN_BATCH(8);
@@ -100,7 +99,7 @@ emit_depth_packets(struct brw_context *brw,
    }
 
    if (stencil_mt == NULL) {
-     BEGIN_BATCH(5);
+      BEGIN_BATCH(5);
       OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2));
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -127,8 +126,7 @@ emit_depth_packets(struct brw_context *brw,
       OUT_BATCH(HSW_STENCIL_ENABLED | mocs_wb << 22 |
                 (2 * stencil_mt->pitch - 1));
       OUT_RELOC64(stencil_mt->bo,
-                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  stencil_offset);
+                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
       OUT_BATCH(stencil_mt ? stencil_mt->qpitch >> 2 : 0);
       ADVANCE_BATCH();
    }
@@ -220,7 +218,6 @@ gen8_emit_depth_stencil_hiz(struct brw_context *brw,
    emit_depth_packets(brw, depth_mt, brw_depthbuffer_format(brw), surftype,
                       ctx->Depth.Mask != 0,
                       stencil_mt, ctx->Stencil._WriteEnabled,
-                      brw->depthstencil.stencil_offset,
                       hiz, width, height, depth, lod, min_array_element);
 }
 
@@ -253,10 +250,10 @@ pma_fix_enable(const struct brw_context *brw)
     */
    const bool hiz_enabled = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
 
-   /* 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
-    * We always leave this set to EDSC_NORMAL (0).
+   /* BRW_NEW_FS_PROG_DATA:
+    * 3DSTATE_WM::Early Depth/Stencil Control != EDSC_PREPS (2).
     */
-   const bool edsc_not_preps = true;
+   const bool edsc_not_preps = !brw->wm.prog_data->early_fragment_tests;
 
    /* 3DSTATE_PS_EXTRA::PixelShaderValid is always true. */
    const bool pixel_shader_valid = true;
@@ -439,7 +436,7 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
                       brw_depth_format(brw, mt->format),
                       BRW_SURFACE_2D,
                       true, /* depth writes */
-                      NULL, false, 0, /* no stencil for now */
+                      NULL, false, /* no stencil for now */
                       true, /* hiz */
                       surface_width,
                       surface_height,
@@ -499,7 +496,7 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
     */
    brw_emit_pipe_control_write(brw,
                                PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0, 0, 0);
+                               brw->workaround_bo, 0, 0, 0);
 
    /* Emit 3DSTATE_WM_HZ_OP again to disable the state overrides. */
    BEGIN_BATCH(5);
diff --git a/src/mesa/drivers/dri/i965/gen8_disable.c b/src/mesa/drivers/dri/i965/gen8_disable.c
index da0d4a5fe7a..32508e377c9 100644
--- a/src/mesa/drivers/dri/i965/gen8_disable.c
+++ b/src/mesa/drivers/dri/i965/gen8_disable.c
@@ -66,7 +66,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_HS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 
    /* Disable the TE */
@@ -101,7 +101,7 @@ disable_stages(struct brw_context *brw)
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_DS << 16 | (2 - 2));
-   OUT_BATCH(0);
+   OUT_BATCH(brw->hw_bt_pool.next_offset);
    ADVANCE_BATCH();
 
    BEGIN_BATCH(2);
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 26a02d3b045..81bd3b21778 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -52,7 +52,9 @@ gen8_upload_gs_state(struct brw_context *brw)
                 ((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((prog_data->base.binding_table.size_bytes / 4) <<
-                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+                (prog_data->base.nr_image_params ?
+                 HSW_GS_UAV_ACCESS_ENABLE : 0));
 
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index a88f109c691..ae18f0f162c 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -58,7 +58,11 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (prog_data->uses_omask)
       dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET;
 
-   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx))
+   if (brw->gen >= 9 && prog_data->pulls_bary)
+      dw1 |= GEN9_PSX_SHADER_PULLS_BARY;
+
+   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) ||
+       prog_data->base.nr_image_params)
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
    BEGIN_BATCH(2);
@@ -115,6 +119,12 @@ upload_wm_state(struct brw_context *brw)
    dw1 |= brw->wm.prog_data->barycentric_interp_modes <<
       GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT;
 
+   /* BRW_NEW_FS_PROG_DATA */
+   if (brw->wm.prog_data->early_fragment_tests)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
+   else if (brw->wm.prog_data->base.nr_image_params)
+      dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
+
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_WM << 16 | (2 - 2));
    OUT_BATCH(dw1);
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index c2b585d0001..6b655ee493e 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -169,7 +169,7 @@ upload_sf(struct brw_context *brw)
    point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
 
    /* Clamp to the hardware limits and convert to fixed point */
-   dw3 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3);
+   dw3 |= U_FIXED(CLAMP(point_size, 0.125f, 255.875f), 3);
 
    /* _NEW_PROGRAM | _NEW_POINT */
    if (!(ctx->VertexProgram.PointSizeEnabled || ctx->Point._Attenuated))
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index b2d1a579815..6c4d3e197a5 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -88,12 +88,12 @@ vertical_alignment(const struct brw_context *brw,
                    uint32_t surf_type)
 {
    /* On Gen9+ vertical alignment is ignored for 1D surfaces and when
-    * tr_mode is not TRMODE_NONE.
+    * tr_mode is not TRMODE_NONE. Set to an arbitrary non-reserved value.
     */
    if (brw->gen > 8 &&
        (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
         surf_type == BRW_SURFACE_1D))
-      return 0;
+      return GEN8_SURFACE_VALIGN_4;
 
    switch (mt->align_h) {
    case 4:
@@ -113,12 +113,12 @@ horizontal_alignment(const struct brw_context *brw,
                      uint32_t surf_type)
 {
    /* On Gen9+ horizontal alignment is ignored when tr_mode is not
-    * TRMODE_NONE.
+    * TRMODE_NONE. Set to an arbitrary non-reserved value.
     */
    if (brw->gen > 8 &&
        (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
         gen9_use_linear_1d_layout(brw, mt)))
-      return 0;
+      return GEN8_SURFACE_HALIGN_4;
 
    switch (mt->align_w) {
    case 4:
@@ -401,8 +401,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
       irb->mt_layer : (irb->mt_layer / MAX2(mt->num_samples, 1));
    GLenum gl_target =
       rb->TexImage ? rb->TexImage->TexObject->Target : GL_TEXTURE_2D;
-   /* FINISHME: Use PTE MOCS on Skylake. */
-   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_WT : BDW_MOCS_PTE;
+   const uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_PTE : BDW_MOCS_PTE;
 
    intel_miptree_used_for_rendering(mt);
 
diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
index 2d8eeb1f10f..2692ad55999 100644
--- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
@@ -53,7 +53,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
       /* _NEW_VIEWPORT: Viewport Matrix Elements */
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index 28f5adddf14..8b5048bee7e 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -53,7 +53,9 @@ upload_vs_state(struct brw_context *brw)
              ((ALIGN(stage_state->sampler_count, 4) / 4) <<
                GEN6_VS_SAMPLER_COUNT_SHIFT) |
              ((prog_data->base.binding_table.size_bytes / 4) <<
-               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
+               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) |
+             (prog_data->base.nr_image_params ?
+              HSW_VS_UAV_ACCESS_ENABLE : 0));
 
    if (prog_data->base.total_scratch) {
       OUT_RELOC64(stage_state->scratch_bo,
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index ed659ed625e..85f20a05729 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -32,6 +32,8 @@
 #include "intel_buffers.h"
 #include "intel_fbo.h"
 #include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
 
 #include <xf86drm.h>
 #include <i915_drm.h>
@@ -44,19 +46,10 @@ intel_batchbuffer_init(struct brw_context *brw)
 {
    intel_batchbuffer_reset(brw);
 
-   if (brw->gen >= 6) {
-      /* We can't just use brw_state_batch to get a chunk of space for
-       * the gen6 workaround because it involves actually writing to
-       * the buffer, and the kernel doesn't let us write to the batch.
-       */
-      brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
-						      "pipe_control workaround",
-						      4096, 4096);
-   }
-
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
+      brw->batch.map_next = brw->batch.cpu_map;
    }
 }
 
@@ -77,12 +70,11 @@ intel_batchbuffer_reset(struct brw_context *brw)
       drm_intel_bo_map(brw->batch.bo, true);
       brw->batch.map = brw->batch.bo->virtual;
    }
+   brw->batch.map_next = brw->batch.map;
 
    brw->batch.reserved_space = BATCH_RESERVED;
    brw->batch.state_batch_offset = brw->batch.bo->size;
-   brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
-   brw->batch.pipe_controls_since_last_cs_stall = 0;
 
    /* We don't know what ring the new batch will be sent to until we see the
     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
@@ -93,7 +85,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
 void
 intel_batchbuffer_save_state(struct brw_context *brw)
 {
-   brw->batch.saved.used = brw->batch.used;
+   brw->batch.saved.map_next = brw->batch.map_next;
    brw->batch.saved.reloc_count =
       drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
 }
@@ -103,8 +95,8 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 {
    drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
 
-   brw->batch.used = brw->batch.saved.used;
-   if (brw->batch.used == 0)
+   brw->batch.map_next = brw->batch.saved.map_next;
+   if (USED_BATCH(brw->batch) == 0)
       brw->batch.ring = UNKNOWN_RING;
 }
 
@@ -114,7 +106,6 @@ intel_batchbuffer_free(struct brw_context *brw)
    free(brw->batch.cpu_map);
    drm_intel_bo_unreference(brw->batch.last_bo);
    drm_intel_bo_unreference(brw->batch.bo);
-   drm_intel_bo_unreference(brw->batch.workaround_bo);
 }
 
 static void
@@ -133,7 +124,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
 					 batch->bo->virtual,
 					 batch->bo->offset64,
-					 batch->used);
+                                         USED_BATCH(*batch));
    } else {
       fprintf(stderr,
 	      "WARNING: failed to map batchbuffer (%s), "
@@ -142,7 +133,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
 					 batch->map,
 					 batch->bo->offset64,
-					 batch->used);
+                                         USED_BATCH(*batch));
    }
 
    drm_intel_decode_set_output_file(decode, stderr);
@@ -218,10 +209,32 @@ brw_finish_batch(struct brw_context *brw)
     */
    brw_emit_query_end(brw);
 
-   /* We may also need to snapshot and disable OA counters. */
-   if (brw->batch.ring == RENDER_RING)
+   if (brw->batch.ring == RENDER_RING) {
+      /* We may also need to snapshot and disable OA counters. */
       brw_perf_monitor_finish_batch(brw);
 
+      if (brw->is_haswell) {
+         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
+          * 3DSTATE_CC_STATE_POINTERS > "Note":
+          *
+          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
+          *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
+          *
+          * From the example in the docs, it seems to expect a regular pipe control
+          * flush here as well. We may have done it already, but emit it anyway.
+          *
+          * See also WaAvoidRCZCounterRollover.
+          */
+         brw_emit_mi_flush(brw);
+         BEGIN_BATCH(2);
+         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
+         OUT_BATCH(brw->cc.state_offset | 1);
+         ADVANCE_BATCH();
+         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                          PIPE_CONTROL_CS_STALL);
+      }
+   }
+
    /* Mark that the current program cache BO has been used by the GPU.
     * It will be reallocated if we need to put new programs in for the
     * next batch.
@@ -267,6 +280,11 @@ throttle(struct brw_context *brw)
    }
 }
 
+/* Drop when RS headers get pulled to libdrm */
+#ifndef I915_EXEC_RESOURCE_STREAMER
+#define I915_EXEC_RESOURCE_STREAMER (1<<15)
+#endif
+
 /* TODO: Push this whole function into bufmgr.
  */
 static int
@@ -278,7 +296,7 @@ do_flush_locked(struct brw_context *brw)
    if (brw->has_llc) {
       drm_intel_bo_unmap(batch->bo);
    } else {
-      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
+      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
       if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
 	 ret = drm_intel_bo_subdata(batch->bo,
 				    batch->state_batch_offset,
@@ -293,7 +311,8 @@ do_flush_locked(struct brw_context *brw)
       if (brw->gen >= 6 && batch->ring == BLT_RING) {
          flags = I915_EXEC_BLT;
       } else {
-         flags = I915_EXEC_RENDER;
+         flags = I915_EXEC_RENDER |
+            (brw->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0);
       }
       if (batch->needs_sol_reset)
 	 flags |= I915_EXEC_GEN7_SOL_RESET;
@@ -303,11 +322,11 @@ do_flush_locked(struct brw_context *brw)
             brw_annotate_aub(brw);
 
 	 if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
-	    ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
-					flags);
+            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
+                                        NULL, 0, 0, flags);
 	 } else {
 	    ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
-						4 * batch->used, flags);
+                                                4 * USED_BATCH(*batch), flags);
 	 }
       }
 
@@ -331,7 +350,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 {
    int ret;
 
-   if (brw->batch.used == 0)
+   if (USED_BATCH(brw->batch) == 0)
       return 0;
 
    if (brw->throttle_batch[0] == NULL) {
@@ -340,7 +359,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      int bytes_for_commands = 4 * brw->batch.used;
+      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
       int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
       int total_bytes = bytes_for_commands + bytes_for_state;
       fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
@@ -356,7 +375,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 
    /* Mark the end of the buffer. */
    intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
-   if (brw->batch.used & 1) {
+   if (USED_BATCH(brw->batch) & 1) {
       /* Round batchbuffer usage to 2 DWORDs. */
       intel_batchbuffer_emit_dword(brw, MI_NOOP);
    }
@@ -373,6 +392,9 @@ _intel_batchbuffer_flush(struct brw_context *brw,
       drm_intel_bo_wait_rendering(brw->batch.bo);
    }
 
+   if (brw->use_resource_streamer)
+      gen7_reset_hw_bt_pool_offsets(brw);
+
    /* Start a new batch buffer. */
    brw_new_batch(brw);
 
@@ -382,15 +404,15 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 
 /*  This is the only way buffers get added to the validate list.
  */
-bool
-intel_batchbuffer_emit_reloc(struct brw_context *brw,
-                             drm_intel_bo *buffer,
-                             uint32_t read_domains, uint32_t write_domain,
-			     uint32_t delta)
+uint32_t
+intel_batchbuffer_reloc(struct brw_context *brw,
+                        drm_intel_bo *buffer, uint32_t offset,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t delta)
 {
    int ret;
 
-   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
 				 buffer, delta,
 				 read_domains, write_domain);
    assert(ret == 0);
@@ -400,18 +422,16 @@ intel_batchbuffer_emit_reloc(struct brw_context *brw,
     * case the buffer doesn't move and we can short-circuit the relocation
     * processing in the kernel
     */
-   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
-
-   return true;
+   return buffer->offset64 + delta;
 }
 
-bool
-intel_batchbuffer_emit_reloc64(struct brw_context *brw,
-                               drm_intel_bo *buffer,
-                               uint32_t read_domains, uint32_t write_domain,
-			       uint32_t delta)
+uint64_t
+intel_batchbuffer_reloc64(struct brw_context *brw,
+                          drm_intel_bo *buffer, uint32_t offset,
+                          uint32_t read_domains, uint32_t write_domain,
+                          uint32_t delta)
 {
-   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
                                      buffer, delta,
                                      read_domains, write_domain);
    assert(ret == 0);
@@ -421,11 +441,7 @@ intel_batchbuffer_emit_reloc64(struct brw_context *brw,
     * case the buffer doesn't move and we can short-circuit the relocation
     * processing in the kernel
     */
-   uint64_t offset = buffer->offset64 + delta;
-   intel_batchbuffer_emit_dword(brw, offset);
-   intel_batchbuffer_emit_dword(brw, offset >> 32);
-
-   return true;
+   return buffer->offset64 + delta;
 }
 
 
@@ -435,312 +451,8 @@ intel_batchbuffer_data(struct brw_context *brw,
 {
    assert((bytes & 3) == 0);
    intel_batchbuffer_require_space(brw, bytes, ring);
-   memcpy(brw->batch.map + brw->batch.used, data, bytes);
-   brw->batch.used += bytes >> 2;
-}
-
-/**
- * According to the latest documentation, any PIPE_CONTROL with the
- * "Command Streamer Stall" bit set must also have another bit set,
- * with five different options:
- *
- *  - Render Target Cache Flush
- *  - Depth Cache Flush
- *  - Stall at Pixel Scoreboard
- *  - Post-Sync Operation
- *  - Depth Stall
- *
- * I chose "Stall at Pixel Scoreboard" since we've used it effectively
- * in the past, but the choice is fairly arbitrary.
- */
-static void
-gen8_add_cs_stall_workaround_bits(uint32_t *flags)
-{
-   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                      PIPE_CONTROL_WRITE_IMMEDIATE |
-                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
-                      PIPE_CONTROL_WRITE_TIMESTAMP |
-                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
-                      PIPE_CONTROL_DEPTH_STALL;
-
-   /* If we're doing a CS stall, and don't already have one of the
-    * workaround bits set, add "Stall at Pixel Scoreboard."
-    */
-   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
-      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
-}
-
-/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
- *
- * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
- *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
- *
- * Note that the kernel does CS stalls between batches, so we only need
- * to count them within a batch.
- */
-static uint32_t
-gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
-{
-   if (brw->gen == 7 && !brw->is_haswell) {
-      if (flags & PIPE_CONTROL_CS_STALL) {
-         /* If we're doing a CS stall, reset the counter and carry on. */
-         brw->batch.pipe_controls_since_last_cs_stall = 0;
-         return 0;
-      }
-
-      /* If this is the fourth pipe control without a CS stall, do one now. */
-      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
-         brw->batch.pipe_controls_since_last_cs_stall = 0;
-         return PIPE_CONTROL_CS_STALL;
-      }
-   }
-   return 0;
-}
-
-/**
- * Emit a PIPE_CONTROL with various flushing flags.
- *
- * The caller is responsible for deciding what flags are appropriate for the
- * given generation.
- */
-void
-brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
-{
-   if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
-
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
-      OUT_BATCH(flags);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else if (brw->gen >= 6) {
-      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(flags);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
-}
-
-/**
- * Emit a PIPE_CONTROL that writes to a buffer object.
- *
- * \p flags should contain one of the following items:
- *  - PIPE_CONTROL_WRITE_IMMEDIATE
- *  - PIPE_CONTROL_WRITE_TIMESTAMP
- *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
- */
-void
-brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
-                            drm_intel_bo *bo, uint32_t offset,
-                            uint32_t imm_lower, uint32_t imm_upper)
-{
-   if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
-
-      BEGIN_BATCH(6);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
-      OUT_BATCH(flags);
-      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                  offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   } else if (brw->gen >= 6) {
-      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
-
-      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
-       * on later platforms.  We always use PPGTT on Gen7+.
-       */
-      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(flags);
-      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                gen6_gtt | offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
-      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
-      OUT_BATCH(imm_lower);
-      OUT_BATCH(imm_upper);
-      ADVANCE_BATCH();
-   }
-}
-
-/**
- * Restriction [DevSNB, DevIVB]:
- *
- * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
- * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
- * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
- * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
- * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
- * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
- * unless SW can otherwise guarantee that the pipeline from WM onwards is
- * already flushed (e.g., via a preceding MI_FLUSH).
- */
-void
-intel_emit_depth_stall_flushes(struct brw_context *brw)
-{
-   assert(brw->gen >= 6 && brw->gen <= 9);
-
-   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
-   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
-   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
-}
-
-/**
- * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
- * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
- *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
- *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
- *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
- *  to be sent before any combination of VS associated 3DSTATE."
- */
-void
-gen7_emit_vs_workaround_flush(struct brw_context *brw)
-{
-   assert(brw->gen == 7);
-   brw_emit_pipe_control_write(brw,
-                               PIPE_CONTROL_WRITE_IMMEDIATE
-                               | PIPE_CONTROL_DEPTH_STALL,
-                               brw->batch.workaround_bo, 0,
-                               0, 0);
-}
-
-
-/**
- * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
- */
-void
-gen7_emit_cs_stall_flush(struct brw_context *brw)
-{
-   brw_emit_pipe_control_write(brw,
-                               PIPE_CONTROL_CS_STALL
-                               | PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0,
-                               0, 0);
-}
-
-
-/**
- * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
- * implementing two workarounds on gen6.  From section 1.4.7.1
- * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
- *
- * [DevSNB-C+{W/A}] Before any depth stall flush (including those
- * produced by non-pipelined state commands), software needs to first
- * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
- * 0.
- *
- * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
- * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
- *
- * And the workaround for these two requires this workaround first:
- *
- * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
- * BEFORE the pipe-control with a post-sync op and no write-cache
- * flushes.
- *
- * And this last workaround is tricky because of the requirements on
- * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
- * volume 2 part 1:
- *
- *     "1 of the following must also be set:
- *      - Render Target Cache Flush Enable ([12] of DW1)
- *      - Depth Cache Flush Enable ([0] of DW1)
- *      - Stall at Pixel Scoreboard ([1] of DW1)
- *      - Depth Stall ([13] of DW1)
- *      - Post-Sync Operation ([13] of DW1)
- *      - Notify Enable ([8] of DW1)"
- *
- * The cache flushes require the workaround flush that triggered this
- * one, so we can't use it.  Depth stall would trigger the same.
- * Post-sync nonzero is what triggered this second workaround, so we
- * can't use that one either.  Notify enable is IRQs, which aren't
- * really our business.  That leaves only stall at scoreboard.
- */
-void
-intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
-{
-   brw_emit_pipe_control_flush(brw,
-                               PIPE_CONTROL_CS_STALL |
-                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
-
-   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
-                               brw->batch.workaround_bo, 0, 0, 0);
-}
-
-/* Emit a pipelined flush to either flush render and texture cache for
- * reading from a FBO-drawn texture, or flush so that frontbuffer
- * render appears on the screen in DRI1.
- *
- * This is also used for the always_flush_cache driconf debug option.
- */
-void
-intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
-{
-   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
-      BEGIN_BATCH_BLT(4);
-      OUT_BATCH(MI_FLUSH_DW);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
-      if (brw->gen >= 6) {
-         if (brw->gen == 9) {
-            /* Hardware workaround: SKL
-             *
-             * Emit Pipe Control with all bits set to zero before emitting
-             * a Pipe Control with VF Cache Invalidate set.
-             */
-            brw_emit_pipe_control_flush(brw, 0);
-         }
-
-         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
-                  PIPE_CONTROL_CS_STALL;
-
-         if (brw->gen == 6) {
-            /* Hardware workaround: SNB B-Spec says:
-             *
-             * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
-             * Flush Enable =1, a PIPE_CONTROL with any non-zero
-             * post-sync-op is required.
-             */
-            intel_emit_post_sync_nonzero_flush(brw);
-         }
-      }
-      brw_emit_pipe_control_flush(brw, flags);
-   }
-
-   brw_render_cache_set_clear(brw);
+   memcpy(brw->batch.map_next, data, bytes);
+   brw->batch.map_next += bytes >> 2;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index 7bdd8364346..84add927c9a 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -22,12 +22,16 @@ extern "C" {
  *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
  *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
  *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
- *       which are 4 DWords each ==> 2 * 3 * 4 * 4 = 96 bytes
+ *       which are 5 DWords each ==> 2 * 3 * 5 * 4 = 120 bytes
  *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
  *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
  *       Sandybridge PIPE_CONTROL madness.
+ *   - CC_STATE workaround on HSW (12 * 4 = 48 bytes)
+ *     - 5 DWords for initial mi_flush
+ *     - 2 DWords for CC state setup
+ *     - 5 DWords for the required pipe control at the end
  */
-#define BATCH_RESERVED 146
+#define BATCH_RESERVED 152
 
 struct intel_batchbuffer;
 
@@ -53,25 +57,20 @@ void intel_batchbuffer_data(struct brw_context *brw,
                             const void *data, GLuint bytes,
                             enum brw_gpu_ring ring);
 
-bool intel_batchbuffer_emit_reloc(struct brw_context *brw,
-                                       drm_intel_bo *buffer,
-				       uint32_t read_domains,
-				       uint32_t write_domain,
-				       uint32_t offset);
-bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
-                                    drm_intel_bo *buffer,
-                                    uint32_t read_domains,
-                                    uint32_t write_domain,
-                                    uint32_t offset);
-void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
-void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
-                                 drm_intel_bo *bo, uint32_t offset,
-                                 uint32_t imm_lower, uint32_t imm_upper);
-void intel_batchbuffer_emit_mi_flush(struct brw_context *brw);
-void intel_emit_post_sync_nonzero_flush(struct brw_context *brw);
-void intel_emit_depth_stall_flushes(struct brw_context *brw);
-void gen7_emit_vs_workaround_flush(struct brw_context *brw);
-void gen7_emit_cs_stall_flush(struct brw_context *brw);
+uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
+                                 drm_intel_bo *buffer,
+                                 uint32_t offset,
+                                 uint32_t read_domains,
+                                 uint32_t write_domain,
+                                 uint32_t delta);
+uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
+                                   drm_intel_bo *buffer,
+                                   uint32_t offset,
+                                   uint32_t read_domains,
+                                   uint32_t write_domain,
+                                   uint32_t delta);
+
+#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
 
 static inline uint32_t float_as_int(float f)
 {
@@ -93,7 +92,7 @@ static inline unsigned
 intel_batchbuffer_space(struct brw_context *brw)
 {
    return (brw->batch.state_batch_offset - brw->batch.reserved_space)
-      - brw->batch.used*4;
+      - USED_BATCH(brw->batch) * 4;
 }
 
 
@@ -103,7 +102,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
 #ifdef DEBUG
    assert(intel_batchbuffer_space(brw) >= 4);
 #endif
-   brw->batch.map[brw->batch.used++] = dword;
+   *brw->batch.map_next++ = dword;
    assert(brw->batch.ring != UNKNOWN_RING);
 }
 
@@ -144,8 +143,8 @@ intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
 {
    intel_batchbuffer_require_space(brw, n * 4, ring);
 
-   brw->batch.emit = brw->batch.used;
 #ifdef DEBUG
+   brw->batch.emit = USED_BATCH(brw->batch);
    brw->batch.total = n;
 #endif
 }
@@ -155,7 +154,7 @@ intel_batchbuffer_advance(struct brw_context *brw)
 {
 #ifdef DEBUG
    struct intel_batchbuffer *batch = &brw->batch;
-   unsigned int _n = batch->used - batch->emit;
+   unsigned int _n = USED_BATCH(*batch) - batch->emit;
    assert(batch->total != 0);
    if (_n != batch->total) {
       fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",
@@ -166,21 +165,42 @@ intel_batchbuffer_advance(struct brw_context *brw)
 #endif
 }
 
-#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
-#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
-#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
-#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
-#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
-   intel_batchbuffer_emit_reloc(brw, buf,			\
-				read_domains, write_domain, delta);	\
+#define BEGIN_BATCH(n) do { /* scope is closed by ADVANCE_BATCH() */ \
+   intel_batchbuffer_begin(brw, (n), RENDER_RING);     \
+   uint32_t *__map = brw->batch.map_next; /* cursor used by OUT_BATCH() */ \
+   brw->batch.map_next += (n) /* claim all n dwords up front */
+
+#define BEGIN_BATCH_BLT(n) do { /* scope is closed by ADVANCE_BATCH() */ \
+   intel_batchbuffer_begin(brw, (n), BLT_RING);        \
+   uint32_t *__map = brw->batch.map_next; /* cursor used by OUT_BATCH() */ \
+   brw->batch.map_next += (n) /* claim all n dwords up front */
+
+#define OUT_BATCH(d) *__map++ = (d)
+#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))
+
+#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \
+   uint32_t __offset = (__map - brw->batch.map) * 4;           \
+   OUT_BATCH(intel_batchbuffer_reloc(brw, (buf), __offset,     \
+                                     (read_domains),           \
+                                     (write_domain),           \
+                                     (delta)));                \
 } while (0)
 
 /* Handle 48-bit address relocations for Gen8+ */
-#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
-   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);	\
+#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {      \
+   uint32_t __offset = (__map - brw->batch.map) * 4;                  \
+   uint64_t reloc64 = intel_batchbuffer_reloc64(brw, (buf), __offset, \
+                                                (read_domains),       \
+                                                (write_domain),       \
+                                                (delta));             \
+   OUT_BATCH(reloc64);                                                \
+   OUT_BATCH(reloc64 >> 32);                                          \
 } while (0)
 
-#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
+#define ADVANCE_BATCH() /* ends the scope opened by BEGIN_BATCH*() */ \
+   assert(__map == brw->batch.map_next); /* every claimed dword written */ \
+   intel_batchbuffer_advance(brw);       \
+} while (0)
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index d3ab769356c..6d92580e725 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -27,6 +27,7 @@
 
 
 #include "main/mtypes.h"
+#include "main/blit.h"
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/colormac.h"
@@ -43,6 +44,23 @@
 
 #define FILE_DEBUG_FLAG DEBUG_BLIT
 
+/* ORs the type##_TILED_* bit matching (tiling, tr_mode) into the variable
+ * named CMD that must be in scope at the point of use. */
+#define SET_TILING_XY_FAST_COPY_BLT(tiling, tr_mode, type)           \
+do {                                                                 \
+   switch (tiling) {                                                 \
+   case I915_TILING_X:                                               \
+      CMD |= type ## _TILED_X;                                       \
+      break;                                                         \
+   case I915_TILING_Y:                                               \
+      CMD |= ((tr_mode) == INTEL_MIPTREE_TRMODE_YS) ?                \
+             type ## _TILED_64K : type ## _TILED_Y;                  \
+      break;                                                         \
+   default:                                                          \
+      unreachable("not reached");                                    \
+   }                                                                 \
+} while (0)
+
 static void
 intel_miptree_set_alpha_to_one(struct brw_context *brw,
                                struct intel_mipmap_tree *mt,
@@ -75,6 +93,10 @@ static uint32_t
 br13_for_cpp(int cpp)
 {
    switch (cpp) {
+   case 16:
+      return BR13_32323232;
+   case 8:
+      return BR13_16161616;
    case 4:
       return BR13_8888;
    case 2:
@@ -86,6 +108,64 @@ br13_for_cpp(int cpp)
    }
 }
 
+/* Encode the horizontal alignment of a surface for XY_FAST_COPY_BLT, shifted
+ * to bit 17 (source) or bit 10 (destination); 0 when tr_mode is NONE. */
+static uint32_t
+get_tr_horizontal_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
+   /* Yf alignments indexed by log2(cpp); non-Yf modes use 4x these. */
+   const uint32_t yf_align[] = {64, 64, 32, 32, 16};
+   const uint32_t bits_per_pixel = cpp * 8;
+   uint32_t align;
+
+   if (tr_mode == INTEL_MIPTREE_TRMODE_NONE)
+      return 0;
+
+   assert (bits_per_pixel >= 8 && bits_per_pixel <= 128 &&
+           _mesa_is_pow_two(bits_per_pixel));
+   const int idx = ffs(bits_per_pixel / 8) - 1;
+
+   align = yf_align[idx];
+   if (tr_mode != INTEL_MIPTREE_TRMODE_YF)
+      align *= 4;
+
+   assert(_mesa_is_pow_two(align));
+
+   /* XY_FAST_COPY_BLT doesn't support horizontal alignment of 16. */
+   if (align == 16)
+      align = 32;
+
+   return (ffs(align) - 6) << (is_src ? 17 : 10);
+}
+
+/* Encode the vertical alignment of a surface for XY_FAST_COPY_BLT, shifted
+ * to bit 15 (source) or bit 8 (destination); 0 when tr_mode is NONE. */
+static uint32_t
+get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
+   /* Yf alignments indexed by log2(cpp); non-Yf modes use 4x these. */
+   const unsigned yf_align[] = {64, 32, 32, 16, 16};
+   const uint32_t bits_per_pixel = cpp * 8;
+   uint32_t align;
+
+   if (tr_mode == INTEL_MIPTREE_TRMODE_NONE)
+      return 0;
+
+   assert (bits_per_pixel >= 8 && bits_per_pixel <= 128 &&
+           _mesa_is_pow_two(bits_per_pixel));
+   const int idx = ffs(bits_per_pixel / 8) - 1;
+
+   align = yf_align[idx];
+   if (tr_mode != INTEL_MIPTREE_TRMODE_YF)
+      align *= 4;
+
+   assert(_mesa_is_pow_two(align));
+
+   /* XY_FAST_COPY_BLT doesn't support vertical alignments of 16 and 32. */
+   if (align == 16 || align == 32)
+      align = 64;
+
+   return (ffs(align) - 7) << (is_src ? 15 : 8);
+}
+
 /**
  * Emits the packet for switching the blitter from X to Y tiled or back.
  *
@@ -96,9 +176,10 @@ br13_for_cpp(int cpp)
  * tiling state would leak into other unsuspecting applications (like the X
  * server).
  */
-static void
+static uint32_t *
 set_blitter_tiling(struct brw_context *brw,
-                   bool dst_y_tiled, bool src_y_tiled)
+                   bool dst_y_tiled, bool src_y_tiled,
+                   uint32_t *__map)
 {
    assert(brw->gen >= 6);
 
@@ -113,19 +194,19 @@ set_blitter_tiling(struct brw_context *brw,
    OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 |
              (dst_y_tiled ? BCS_SWCTRL_DST_Y : 0) |
              (src_y_tiled ? BCS_SWCTRL_SRC_Y : 0));
+   return __map;
 }
+#define SET_BLITTER_TILING(...) __map = set_blitter_tiling(__VA_ARGS__, __map)
 
-#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) do {         \
+#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled)              \
       BEGIN_BATCH_BLT(n + ((dst_y_tiled || src_y_tiled) ? 14 : 0));     \
       if (dst_y_tiled || src_y_tiled)                                   \
-         set_blitter_tiling(brw, dst_y_tiled, src_y_tiled);             \
-   } while (0)
+         SET_BLITTER_TILING(brw, dst_y_tiled, src_y_tiled)
 
-#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) do {              \
+#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled)                   \
       if (dst_y_tiled || src_y_tiled)                                   \
-         set_blitter_tiling(brw, false, false);                         \
-      ADVANCE_BATCH();                                                  \
-   } while (0)
+         SET_BLITTER_TILING(brw, false, false);                         \
+      ADVANCE_BATCH()
 
 static int
 blt_pitch(struct intel_mipmap_tree *mt)
@@ -278,9 +359,11 @@ intel_miptree_blit(struct brw_context *brw,
                           src_pitch,
                           src_mt->bo, src_mt->offset,
                           src_mt->tiling,
+                          src_mt->tr_mode,
                           dst_mt->pitch,
                           dst_mt->bo, dst_mt->offset,
                           dst_mt->tiling,
+                          dst_mt->tr_mode,
                           src_x, src_y,
                           dst_x, dst_y,
                           width, height,
@@ -313,6 +396,112 @@ alignment_valid(struct brw_context *brw, unsigned offset, uint32_t tiling)
    return true;
 }
 
+/* Returns true when this copy meets every restriction checked below for
+ * using XY_FAST_COPY_BLT instead of the regular XY_SRC_COPY_BLT path. */
+static bool
+can_fast_copy_blit(struct brw_context *brw,
+		   drm_intel_bo *src_buffer,
+                   int16_t src_x, int16_t src_y,
+                   uintptr_t src_offset, uint32_t src_pitch,
+                   uint32_t src_tiling, uint32_t src_tr_mode,
+		   drm_intel_bo *dst_buffer,
+                   int16_t dst_x, int16_t dst_y,
+                   uintptr_t dst_offset, uint32_t dst_pitch,
+                   uint32_t dst_tiling, uint32_t dst_tr_mode,
+                   int16_t w, int16_t h, uint32_t cpp)
+{
+   const bool dst_tiling_none = dst_tiling == I915_TILING_NONE;
+   const bool src_tiling_none = src_tiling == I915_TILING_NONE;
+
+   if (brw->gen < 9)
+      return false;
+
+   if (src_buffer->handle == dst_buffer->handle &&
+       _mesa_regions_overlap(src_x, src_y, src_x + w, src_y + h,
+                             dst_x, dst_y, dst_x + w, dst_y + h))
+      return false;
+
+   /* Only enable fast copy blit when at least one surface is Yf/Ys tiled.
+    * FIXME: Based on performance data, enable for all surface types later.
+    */
+   if (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE &&
+       dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE)
+      return false;
+
+   /* For all surface types buffers must be cacheline-aligned. */
+   if ((dst_offset | src_offset) & 63)
+      return false;
+
+   /* Color depth greater than 128 bits not supported. */
+   if (cpp > 16)
+      return false;
+
+   /* For Fast Copy Blits the pitch cannot be a negative number. So, bit 15
+    * of both the source and the destination pitch must be zero.
+    */
+   if ((src_pitch >> 15 & 1) != 0 || (dst_pitch >> 15 & 1) != 0)
+      return false;
+
+   /* For Linear surfaces, the pitch has to be an OWord (16byte) multiple. */
+   if ((src_tiling_none && src_pitch % 16 != 0) ||
+       (dst_tiling_none && dst_pitch % 16 != 0))
+      return false;
+
+   /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
+    * (X direction), so it is always Cache Line aligned (64byte multiple).
+    */
+   if ((!dst_tiling_none && dst_pitch % 64 != 0) ||
+       (!src_tiling_none && src_pitch % 64 != 0))
+      return false;
+
+   return true;
+}
+
+/* Builds the blit command dword: XY_FAST_COPY_BLT with its tiling and
+ * alignment fields, or XY_SRC_COPY_BLT with channel-write and tiled bits. */
+static uint32_t
+xy_blit_cmd(uint32_t src_tiling, uint32_t src_tr_mode,
+            uint32_t dst_tiling, uint32_t dst_tr_mode,
+            uint32_t cpp, bool use_fast_copy_blit)
+{
+   const bool src_tiled = src_tiling != I915_TILING_NONE;
+   const bool dst_tiled = dst_tiling != I915_TILING_NONE;
+   uint32_t CMD = 0;
+
+   if (!use_fast_copy_blit) {
+      assert(cpp <= 4);
+      switch (cpp) {
+      case 1:
+      case 2:
+         CMD = XY_SRC_COPY_BLT_CMD;
+         break;
+      case 4:
+         CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
+         break;
+      default:
+         unreachable("not reached");
+      }
+
+      if (dst_tiled)
+         CMD |= XY_DST_TILED;
+      if (src_tiled)
+         CMD |= XY_SRC_TILED;
+      return CMD;
+   }
+
+   CMD = XY_FAST_COPY_BLT_CMD;
+   if (dst_tiled)
+      SET_TILING_XY_FAST_COPY_BLT(dst_tiling, dst_tr_mode, XY_FAST_DST);
+   if (src_tiled)
+      SET_TILING_XY_FAST_COPY_BLT(src_tiling, src_tr_mode, XY_FAST_SRC);
+
+   CMD |= get_tr_horizontal_align(src_tr_mode, cpp, true /* is_src */);
+   CMD |= get_tr_vertical_align(src_tr_mode, cpp, true /* is_src */);
+   CMD |= get_tr_horizontal_align(dst_tr_mode, cpp, false /* is_src */);
+   CMD |= get_tr_vertical_align(dst_tr_mode, cpp, false /* is_src */);
+   return CMD;
+}
+
 /* Copy BitBlt
  */
 bool
@@ -322,10 +511,12 @@ intelEmitCopyBlit(struct brw_context *brw,
 		  drm_intel_bo *src_buffer,
 		  GLuint src_offset,
 		  uint32_t src_tiling,
+		  uint32_t src_tr_mode,
 		  GLshort dst_pitch,
 		  drm_intel_bo *dst_buffer,
 		  GLuint dst_offset,
 		  uint32_t dst_tiling,
+		  uint32_t dst_tr_mode,
 		  GLshort src_x, GLshort src_y,
 		  GLshort dst_x, GLshort dst_y,
 		  GLshort w, GLshort h,
@@ -337,18 +528,11 @@ intelEmitCopyBlit(struct brw_context *brw,
    drm_intel_bo *aper_array[3];
    bool dst_y_tiled = dst_tiling == I915_TILING_Y;
    bool src_y_tiled = src_tiling == I915_TILING_Y;
-
-   if (!alignment_valid(brw, dst_offset, dst_tiling))
-      return false;
-   if (!alignment_valid(brw, src_offset, src_tiling))
-      return false;
+   bool use_fast_copy_blit = false;
 
    if ((dst_y_tiled || src_y_tiled) && brw->gen < 6)
       return false;
 
-   assert(!dst_y_tiled || (dst_pitch % 128) == 0);
-   assert(!src_y_tiled || (src_pitch % 128) == 0);
-
    /* do space check before going any further */
    do {
        aper_array[0] = brw->batch.bo;
@@ -373,52 +557,98 @@ intelEmitCopyBlit(struct brw_context *brw,
        src_buffer, src_pitch, src_offset, src_x, src_y,
        dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
 
-   /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to drop
-    * the low bits.  Offsets must be naturally aligned.
-    */
-   if (src_pitch % 4 != 0 || src_offset % cpp != 0 ||
-       dst_pitch % 4 != 0 || dst_offset % cpp != 0)
-      return false;
+   use_fast_copy_blit = can_fast_copy_blit(brw,
+                                           src_buffer,
+                                           src_x, src_y,
+                                           src_offset, src_pitch,
+                                           src_tiling, src_tr_mode,
+                                           dst_buffer,
+                                           dst_x, dst_y,
+                                           dst_offset, dst_pitch,
+                                           dst_tiling, dst_tr_mode,
+                                           w, h, cpp);
+   assert(use_fast_copy_blit ||
+          (src_tr_mode == INTEL_MIPTREE_TRMODE_NONE &&
+           dst_tr_mode == INTEL_MIPTREE_TRMODE_NONE));
 
-   /* For big formats (such as floating point), do the copy using 16 or 32bpp
-    * and multiply the coordinates.
-    */
-   if (cpp > 4) {
-      if (cpp % 4 == 2) {
-         dst_x *= cpp / 2;
-         dst_x2 *= cpp / 2;
-         src_x *= cpp / 2;
-         cpp = 2;
-      } else {
-         assert(cpp % 4 == 0);
-         dst_x *= cpp / 4;
-         dst_x2 *= cpp / 4;
-         src_x *= cpp / 4;
-         cpp = 4;
+   if (use_fast_copy_blit) {
+      /* When two sequential fast copy blits have different source surfaces,
+       * but their destinations refer to the same destination surfaces and
+       * therefore destinations overlap it is imperative that a flush be
+       * inserted between the two blits.
+       *
+       * FIXME: Figure out a way to avoid flushing when not required.
+       */
+      brw_emit_mi_flush(brw);
+
+      assert(cpp <= 16);
+      BR13 = br13_for_cpp(cpp);
+
+      if (src_tr_mode == INTEL_MIPTREE_TRMODE_YF)
+         BR13 |= XY_FAST_SRC_TRMODE_YF;
+
+      if (dst_tr_mode == INTEL_MIPTREE_TRMODE_YF)
+         BR13 |= XY_FAST_DST_TRMODE_YF;
+
+      CMD = xy_blit_cmd(src_tiling, src_tr_mode,
+                        dst_tiling, dst_tr_mode,
+                        cpp, use_fast_copy_blit);
+
+      /* For tiled source and destination, pitch value should be specified
+       * as a number of Dwords.
+       */
+      if (dst_tiling != I915_TILING_NONE)
+         dst_pitch /= 4;
+
+      if (src_tiling != I915_TILING_NONE)
+         src_pitch /= 4;
+
+   } else {
+      assert(!dst_y_tiled || (dst_pitch % 128) == 0);
+      assert(!src_y_tiled || (src_pitch % 128) == 0);
+
+      /* For big formats (such as floating point), do the copy using 16 or
+       * 32bpp and multiply the coordinates.
+       */
+      if (cpp > 4) {
+         if (cpp % 4 == 2) {
+            dst_x *= cpp / 2;
+            dst_x2 *= cpp / 2;
+            src_x *= cpp / 2;
+            cpp = 2;
+         } else {
+            assert(cpp % 4 == 0);
+            dst_x *= cpp / 4;
+            dst_x2 *= cpp / 4;
+            src_x *= cpp / 4;
+            cpp = 4;
+         }
       }
-   }
 
-   BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;
+      if (!alignment_valid(brw, dst_offset, dst_tiling))
+         return false;
+      if (!alignment_valid(brw, src_offset, src_tiling))
+         return false;
 
-   switch (cpp) {
-   case 1:
-   case 2:
-      CMD = XY_SRC_COPY_BLT_CMD;
-      break;
-   case 4:
-      CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      break;
-   default:
-      return false;
-   }
+      /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to drop
+       * the low bits.  Offsets must be naturally aligned.
+       */
+      if (src_pitch % 4 != 0 || src_offset % cpp != 0 ||
+          dst_pitch % 4 != 0 || dst_offset % cpp != 0)
+         return false;
 
-   if (dst_tiling != I915_TILING_NONE) {
-      CMD |= XY_DST_TILED;
-      dst_pitch /= 4;
-   }
-   if (src_tiling != I915_TILING_NONE) {
-      CMD |= XY_SRC_TILED;
-      src_pitch /= 4;
+      assert(cpp <= 4);
+      BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;
+
+      CMD = xy_blit_cmd(src_tiling, src_tr_mode,
+                        dst_tiling, dst_tr_mode,
+                        cpp, use_fast_copy_blit);
+
+      if (dst_tiling != I915_TILING_NONE)
+         dst_pitch /= 4;
+
+      if (src_tiling != I915_TILING_NONE)
+         src_pitch /= 4;
    }
 
    if (dst_y2 <= dst_y || dst_x2 <= dst_x) {
@@ -460,7 +690,7 @@ intelEmitCopyBlit(struct brw_context *brw,
 
    ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    return true;
 }
@@ -544,7 +774,7 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
 
    intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    return true;
 }
@@ -576,7 +806,9 @@ intel_emit_linear_blit(struct brw_context *brw,
    dst_x = dst_offset % 64;
    ok = intelEmitCopyBlit(brw, 1,
 			  pitch, src_bo, src_offset - src_x, I915_TILING_NONE,
+                          INTEL_MIPTREE_TRMODE_NONE,
 			  pitch, dst_bo, dst_offset - dst_x, I915_TILING_NONE,
+                          INTEL_MIPTREE_TRMODE_NONE,
 			  src_x, 0, /* src x/y */
 			  dst_x, 0, /* dst x/y */
 			  pitch, height, /* w, h */
@@ -595,7 +827,9 @@ intel_emit_linear_blit(struct brw_context *brw,
    if (size != 0) {
       ok = intelEmitCopyBlit(brw, 1,
 			     pitch, src_bo, src_offset - src_x, I915_TILING_NONE,
+                             INTEL_MIPTREE_TRMODE_NONE,
 			     pitch, dst_bo, dst_offset - dst_x, I915_TILING_NONE,
+                             INTEL_MIPTREE_TRMODE_NONE,
 			     src_x, 0, /* src x/y */
 			     dst_x, 0, /* dst x/y */
 			     size, 1, /* w, h */
@@ -667,5 +901,5 @@ intel_miptree_set_alpha_to_one(struct brw_context *brw,
    OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
    ADVANCE_BATCH_TILED(dst_y_tiled, false);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
index 2287c379c4e..c3d19a5a20e 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -32,19 +32,21 @@
 
 bool
 intelEmitCopyBlit(struct brw_context *brw,
-                              GLuint cpp,
-                              GLshort src_pitch,
-                              drm_intel_bo *src_buffer,
-                              GLuint src_offset,
-			      uint32_t src_tiling,
-                              GLshort dst_pitch,
-                              drm_intel_bo *dst_buffer,
-                              GLuint dst_offset,
-			      uint32_t dst_tiling,
-                              GLshort srcx, GLshort srcy,
-                              GLshort dstx, GLshort dsty,
-                              GLshort w, GLshort h,
-			      GLenum logicop );
+                  GLuint cpp,
+                  GLshort src_pitch,
+                  drm_intel_bo *src_buffer,
+                  GLuint src_offset,
+                  uint32_t src_tiling,
+                  uint32_t src_tr_mode,
+                  GLshort dst_pitch,
+                  drm_intel_bo *dst_buffer,
+                  GLuint dst_offset,
+                  uint32_t dst_tiling,
+                  uint32_t dst_tr_mode,
+                  GLshort srcx, GLshort srcy,
+                  GLshort dstx, GLshort dsty,
+                  GLshort w, GLshort h,
+                  GLenum logicop);
 
 bool intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst);
 
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 627c487f0e7..ff05b5cd0e7 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -560,7 +560,7 @@ brw_unmap_buffer(struct gl_context *ctx,
        * flush.  Once again, we wish for a domain tracker in libdrm to cover
        * usage inside of a batchbuffer.
        */
-      intel_batchbuffer_emit_mi_flush(brw);
+      brw_emit_mi_flush(brw);
 
       drm_intel_bo_unreference(intel_obj->range_map_bo[index]);
       intel_obj->range_map_bo[index] = NULL;
@@ -632,7 +632,7 @@ brw_copy_buffer_subdata(struct gl_context *ctx,
     * flush.  Once again, we wish for a domain tracker in libdrm to cover
     * usage inside of a batchbuffer.
     */
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c
index f4c7eff2904..3706704bf1a 100644
--- a/src/mesa/drivers/dri/i965/intel_copy_image.c
+++ b/src/mesa/drivers/dri/i965/intel_copy_image.c
@@ -126,9 +126,11 @@ copy_image_with_blitter(struct brw_context *brw,
                             src_mt->pitch,
                             src_mt->bo, src_mt->offset,
                             src_mt->tiling,
+                            src_mt->tr_mode,
                             dst_mt->pitch,
                             dst_mt->bo, dst_mt->offset,
                             dst_mt->tiling,
+                            dst_mt->tr_mode,
                             src_x, src_y,
                             dst_x, dst_y,
                             src_width, src_height,
diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index 75cf7854eff..58f41bfd55d 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -79,11 +79,13 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage)
 {
    uint64_t flags[] = {
       [MESA_SHADER_VERTEX] = DEBUG_VS,
+      [MESA_SHADER_TESS_CTRL] = 0,
+      [MESA_SHADER_TESS_EVAL] = 0,
       [MESA_SHADER_GEOMETRY] = DEBUG_GS,
       [MESA_SHADER_FRAGMENT] = DEBUG_WM,
       [MESA_SHADER_COMPUTE] = DEBUG_CS,
    };
-   STATIC_ASSERT(MESA_SHADER_STAGES == 4);
+   STATIC_ASSERT(MESA_SHADER_STAGES == 6);
    return flags[stage];
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index c99677c7197..3bc28a12026 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -64,10 +64,10 @@ can_do_pipelined_register_writes(struct brw_context *brw)
    /* Set a value in a BO to a known quantity.  The workaround BO already
     * exists and doesn't contain anything important, so we may as well use it.
     */
-   drm_intel_bo_map(brw->batch.workaround_bo, true);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, true);
+   data = brw->workaround_bo->virtual;
    data[offset] = 0xffffffff;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    /* Write the register. */
    BEGIN_BATCH(3);
@@ -76,13 +76,13 @@ can_do_pipelined_register_writes(struct brw_context *brw)
    OUT_BATCH(expected_value);
    ADVANCE_BATCH();
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Save the register's value back to the buffer. */
    BEGIN_BATCH(3);
    OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
    OUT_BATCH(reg);
-   OUT_RELOC(brw->batch.workaround_bo,
+   OUT_RELOC(brw->workaround_bo,
              I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
              offset * sizeof(uint32_t));
    ADVANCE_BATCH();
@@ -90,10 +90,10 @@ can_do_pipelined_register_writes(struct brw_context *brw)
    intel_batchbuffer_flush(brw);
 
    /* Check whether the value got written. */
-   drm_intel_bo_map(brw->batch.workaround_bo, false);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, false);
+   data = brw->workaround_bo->virtual;
    bool success = data[offset] == expected_value;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    result = success;
 
@@ -120,10 +120,10 @@ can_write_oacontrol(struct brw_context *brw)
    /* Set a value in a BO to a known quantity.  The workaround BO already
     * exists and doesn't contain anything important, so we may as well use it.
     */
-   drm_intel_bo_map(brw->batch.workaround_bo, true);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, true);
+   data = brw->workaround_bo->virtual;
    data[offset] = 0xffffffff;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    /* Write OACONTROL. */
    BEGIN_BATCH(3);
@@ -132,18 +132,18 @@ can_write_oacontrol(struct brw_context *brw)
    OUT_BATCH(expected_value);
    ADVANCE_BATCH();
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Save the register's value back to the buffer. */
    BEGIN_BATCH(3);
    OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
    OUT_BATCH(OACONTROL);
-   OUT_RELOC(brw->batch.workaround_bo,
+   OUT_RELOC(brw->workaround_bo,
              I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
              offset * sizeof(uint32_t));
    ADVANCE_BATCH();
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 
    /* Set OACONTROL back to zero (everything off). */
    BEGIN_BATCH(3);
@@ -155,10 +155,10 @@ can_write_oacontrol(struct brw_context *brw)
    intel_batchbuffer_flush(brw);
 
    /* Check whether the value got written. */
-   drm_intel_bo_map(brw->batch.workaround_bo, false);
-   data = brw->batch.workaround_bo->virtual;
+   drm_intel_bo_map(brw->workaround_bo, false);
+   data = brw->workaround_bo->virtual;
    bool success = data[offset] == expected_value;
-   drm_intel_bo_unmap(brw->batch.workaround_bo);
+   drm_intel_bo_unmap(brw->workaround_bo);
 
    result = success;
 
@@ -284,8 +284,6 @@ intelInitExtensions(struct gl_context *ctx)
    }
 
    if (brw->gen >= 6) {
-      uint64_t dummy;
-
       ctx->Extensions.ARB_blend_func_extended =
          brw->optionCache.info == NULL ||
          !driQueryOptionb(&brw->optionCache, "disable_blend_func_extended");
@@ -311,13 +309,14 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.OES_depth_texture_cube_map = true;
 
       /* Test if the kernel has the ioctl. */
-      if (brw->bufmgr && drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &dummy) == 0)
+      if (brw->intelScreen->hw_has_timestamp)
          ctx->Extensions.ARB_timer_query = true;
 
       /* Only enable this in core profile because other parts of Mesa behave
        * slightly differently when the extension is enabled.
        */
       if (ctx->API == API_OPENGL_CORE) {
+         ctx->Extensions.ARB_shader_subroutine = true;
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
       }
@@ -331,6 +330,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_framebuffer_no_attachments = true;
       ctx->Extensions.ARB_gpu_shader5 = true;
       ctx->Extensions.ARB_shader_atomic_counters = true;
+      ctx->Extensions.ARB_shader_image_load_store = true;
       ctx->Extensions.ARB_texture_compression_bptc = true;
       ctx->Extensions.ARB_texture_view = true;
 
@@ -351,6 +351,7 @@ intelInitExtensions(struct gl_context *ctx)
       if (ctx->API == API_OPENGL_CORE) {
          ctx->Extensions.ARB_viewport_array = true;
          ctx->Extensions.AMD_vertex_shader_viewport_index = true;
+         ctx->Extensions.ARB_shader_subroutine = true;
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 1b3a72f3ec2..72648b01e33 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -310,7 +310,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend
    intel_miptree_release(&irb->mt);
 
    DBG("%s: %s: %s (%dx%d)\n", __func__,
-       _mesa_lookup_enum_by_nr(internalFormat),
+       _mesa_enum_to_string(internalFormat),
        _mesa_get_format_name(rb->Format), width, height);
 
    if (width == 0 || height == 0)
@@ -551,10 +551,12 @@ intel_renderbuffer_update_wrapper(struct brw_context *brw,
 
    irb->mt_layer = layer_multiplier * layer;
 
-   if (layered) {
-      irb->layer_count = image->TexObject->NumLayers ?: mt->level[level].depth / layer_multiplier;
-   } else {
+   if (!layered) {
       irb->layer_count = 1;
+   } else if (image->TexObject->NumLayers > 0) {
+      irb->layer_count = image->TexObject->NumLayers;
+   } else {
+      irb->layer_count = mt->level[level].depth / layer_multiplier;
    }
 
    intel_miptree_reference(&irb->mt, mt);
@@ -1020,6 +1022,9 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
    struct intel_mipmap_tree *new_mt;
    int width, height, depth;
 
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                           MIPTREE_LAYOUT_TILING_ANY;
+
    intel_miptree_get_dimensions_for_image(rb->TexImage, &width, &height, &depth);
 
    new_mt = intel_miptree_create(brw, rb->TexImage->TexObject->Target,
@@ -1028,8 +1033,7 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                  intel_image->base.Base.Level,
                                  width, height, depth,
                                  irb->mt->num_samples,
-                                 INTEL_MIPTREE_TILING_ANY,
-                                 MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                 layout_flags);
 
    if (intel_miptree_wants_hiz_buffer(brw, new_mt)) {
       intel_miptree_alloc_hiz(brw, new_mt);
@@ -1076,7 +1080,7 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
    if (!_mesa_set_search(brw->render_cache, bo))
       return;
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 6aa969a4930..e85c3f00c7b 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -272,7 +272,6 @@ intel_miptree_create_layout(struct brw_context *brw,
                             GLuint height0,
                             GLuint depth0,
                             GLuint num_samples,
-                            enum intel_miptree_tiling_mode requested,
                             uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
@@ -280,7 +279,7 @@ intel_miptree_create_layout(struct brw_context *brw,
       return NULL;
 
    DBG("%s target %s format %s level %d..%d slices %d <-- %p\n", __func__,
-       _mesa_lookup_enum_by_nr(target),
+       _mesa_enum_to_string(target),
        _mesa_get_format_name(format),
        first_level, last_level, depth0, mt);
 
@@ -454,8 +453,10 @@ intel_miptree_create_layout(struct brw_context *brw,
 	(brw->has_separate_stencil &&
          intel_miptree_wants_hiz_buffer(brw, mt)))) {
       uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-      if (brw->gen == 6)
-         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
+      if (brw->gen == 6) {
+         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD |
+                          MIPTREE_LAYOUT_TILING_ANY;
+      }
 
       mt->stencil_mt = intel_miptree_create(brw,
                                             mt->target,
@@ -466,7 +467,6 @@ intel_miptree_create_layout(struct brw_context *brw,
                                             mt->logical_height0,
                                             mt->logical_depth0,
                                             num_samples,
-                                            INTEL_MIPTREE_TILING_ANY,
                                             stencil_flags);
 
       if (!mt->stencil_mt) {
@@ -510,7 +510,7 @@ intel_miptree_create_layout(struct brw_context *brw,
       assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
    }
 
-   brw_miptree_layout(brw, mt, requested, layout_flags);
+   brw_miptree_layout(brw, mt, layout_flags);
 
    if (mt->disable_aux_buffers)
       assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
@@ -558,6 +558,53 @@ intel_lower_compressed_format(struct brw_context *brw, mesa_format format)
    }
 }
 
+/* Returns the Yf/Ys tiled bo size; writes the alignment and pitch outputs. */
+static unsigned long
+intel_get_yf_ys_bo_size(struct intel_mipmap_tree *mt, unsigned *alignment,
+                        unsigned long *pitch)
+{
+   const uint32_t bpp = mt->cpp * 8;
+   const uint32_t aspect_ratio = (bpp == 16 || bpp == 64) ? 2 : 1;
+   uint32_t tile_width, tile_height;
+   unsigned long stride, size, aligned_y;
+
+   assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE); /* Yf or Ys expected */
+
+   switch (bpp) { /* base tile height is chosen from bits per pixel */
+   case 8:
+      tile_height = 64;
+      break;
+   case 16:
+   case 32:
+      tile_height = 32;
+      break;
+   case 64:
+   case 128:
+      tile_height = 16;
+      break;
+   default:
+      unreachable("not reached"); /* no other bpp is handled here */
+   }
+
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YS)
+      tile_height *= 4; /* Ys uses 4x the base tile height */
+
+   aligned_y = ALIGN(mt->total_height, tile_height);
+   stride = mt->total_width * mt->cpp; /* unaligned row size */
+   tile_width = tile_height * mt->cpp * aspect_ratio;
+   stride = ALIGN(stride, tile_width); /* align the row size to tile_width */
+   size = stride * aligned_y;
+
+   if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YF) {
+      assert(size % 4096 == 0); /* size must be a multiple of the alignment */
+      *alignment = 4096;
+   } else { /* not Yf: 64KB alignment (presumably Ys, given the assert above) */
+      assert(size % (64 * 1024) == 0);
+      *alignment = 64 * 1024;
+   }
+   *pitch = stride; /* the aligned row size is reported as the pitch */
+   return size;
+}
 
 struct intel_mipmap_tree *
 intel_miptree_create(struct brw_context *brw,
@@ -569,7 +616,6 @@ intel_miptree_create(struct brw_context *brw,
                      GLuint height0,
                      GLuint depth0,
                      GLuint num_samples,
-                     enum intel_miptree_tiling_mode requested_tiling,
                      uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
@@ -587,7 +633,7 @@ intel_miptree_create(struct brw_context *brw,
    mt = intel_miptree_create_layout(brw, target, format,
                                     first_level, last_level, width0,
                                     height0, depth0, num_samples,
-                                    requested_tiling, layout_flags);
+                                    layout_flags);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -616,10 +662,22 @@ intel_miptree_create(struct brw_context *brw,
       alloc_flags |= BO_ALLOC_FOR_RENDER;
 
    unsigned long pitch;
-   mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree", total_width,
-                                     total_height, mt->cpp, &mt->tiling,
-                                     &pitch, alloc_flags);
    mt->etc_format = etc_format;
+
+   if (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
+      unsigned alignment = 0;
+      unsigned long size;
+      size = intel_get_yf_ys_bo_size(mt, &alignment, &pitch);
+      assert(size);
+      mt->bo = drm_intel_bo_alloc_for_render(brw->bufmgr, "miptree",
+                                             size, alignment);
+   } else {
+      mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
+                                        total_width, total_height, mt->cpp,
+                                        &mt->tiling, &pitch,
+                                        alloc_flags);
+   }
+
    mt->pitch = pitch;
 
    /* If the BO is too large to fit in the aperture, we need to use the
@@ -698,17 +756,16 @@ intel_miptree_create_for_bo(struct brw_context *brw,
 
    target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
 
-   /* 'requested' parameter of intel_miptree_create_layout() is relevant
-    * only for non bo miptree. Tiling for bo is already computed above.
-    * So, the tiling requested (INTEL_MIPTREE_TILING_ANY) below is
-    * just a place holder and will not make any change to the miptree
-    * tiling format.
+   /* The BO already has a tiling format and we shouldn't confuse the lower
+    * layers by making it try to find a tiling format again.
     */
+   assert((layout_flags & MIPTREE_LAYOUT_TILING_ANY) == 0);
+   assert((layout_flags & MIPTREE_LAYOUT_TILING_NONE) == 0);
+
    layout_flags |= MIPTREE_LAYOUT_FOR_BO;
    mt = intel_miptree_create_layout(brw, target, format,
                                     0, 0,
                                     width, height, depth, 0,
-                                    INTEL_MIPTREE_TILING_ANY,
                                     layout_flags);
    if (!mt)
       return NULL;
@@ -816,11 +873,13 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw,
    uint32_t depth = 1;
    bool ok;
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
+   const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                                 MIPTREE_LAYOUT_TILING_ANY;
+
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
                              width, height, depth, num_samples,
-                             INTEL_MIPTREE_TILING_ANY,
-                             MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                             layout_flags);
    if (!mt)
       goto fail;
 
@@ -1325,6 +1384,8 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
     *
     *     "The MCS surface must be stored as Tile Y."
     */
+   const uint32_t mcs_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                              MIPTREE_LAYOUT_TILING_Y;
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1334,8 +1395,7 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
                                      mt->logical_height0,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
-                                     INTEL_MIPTREE_TILING_Y,
-                                     MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                     mcs_flags);
 
    /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
     *
@@ -1383,9 +1443,11 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
    unsigned mcs_height =
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
-   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
-   if (brw->gen >= 8)
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                           MIPTREE_LAYOUT_TILING_Y;
+   if (brw->gen >= 8) {
       layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
+   }
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1395,7 +1457,6 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                      mcs_height,
                                      mt->logical_depth0,
                                      0 /* num_samples */,
-                                     INTEL_MIPTREE_TILING_Y,
                                      layout_flags);
 
    return mt->mcs_mt;
@@ -1456,21 +1517,23 @@ intel_gen7_hiz_buf_create(struct brw_context *brw,
    /* Gen7 PRM Volume 2, Part 1, 11.5.3 "Hierarchical Depth Buffer" documents
     * adjustments required for Z_Height and Z_Width based on multisampling.
     */
-   switch (mt->num_samples) {
-   case 0:
-   case 1:
-      break;
-   case 2:
-   case 4:
-      z_width *= 2;
-      z_height *= 2;
-      break;
-   case 8:
-      z_width *= 4;
-      z_height *= 2;
-      break;
-   default:
-      unreachable("unsupported sample count");
+   if (brw->gen < 9) {
+      switch (mt->num_samples) {
+      case 0:
+      case 1:
+         break;
+      case 2:
+      case 4:
+         z_width *= 2;
+         z_height *= 2;
+         break;
+      case 8:
+         z_width *= 4;
+         z_height *= 2;
+         break;
+      default:
+         unreachable("unsupported sample count");
+      }
    }
 
    const unsigned vertical_align = 8; /* 'j' in the docs */
@@ -1646,6 +1709,7 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
    if (!buf)
       return NULL;
 
+   layout_flags |= MIPTREE_LAYOUT_TILING_ANY;
    buf->mt = intel_miptree_create(brw,
                                   mt->target,
                                   mt->format,
@@ -1655,7 +1719,6 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                                   mt->logical_height0,
                                   mt->logical_depth0,
                                   mt->num_samples,
-                                  INTEL_MIPTREE_TILING_ANY,
                                   layout_flags);
    if (!buf->mt) {
       free(buf);
@@ -2086,7 +2149,7 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  0, INTEL_MIPTREE_TILING_NONE, 0);
+                                  0, MIPTREE_LAYOUT_TILING_NONE);
 
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index bde6daa4e2d..790d3129207 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -516,12 +516,6 @@ struct intel_mipmap_tree
    GLuint refcount;
 };
 
-enum intel_miptree_tiling_mode {
-   INTEL_MIPTREE_TILING_ANY,
-   INTEL_MIPTREE_TILING_Y,
-   INTEL_MIPTREE_TILING_NONE,
-};
-
 void
 intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
@@ -541,6 +535,11 @@ enum {
    MIPTREE_LAYOUT_FOR_BO                   = 1 << 2,
    MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3,
    MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
+
+   MIPTREE_LAYOUT_TILING_Y                 = 1 << 5,
+   MIPTREE_LAYOUT_TILING_NONE              = 1 << 6,
+   MIPTREE_LAYOUT_TILING_ANY               = MIPTREE_LAYOUT_TILING_Y |
+                                             MIPTREE_LAYOUT_TILING_NONE,
 };
 
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
@@ -552,7 +551,6 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLuint height0,
                                                GLuint depth0,
                                                GLuint num_samples,
-                                               enum intel_miptree_tiling_mode,
                                                uint32_t flags);
 
 struct intel_mipmap_tree *
@@ -771,7 +769,6 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
 void
 brw_miptree_layout(struct brw_context *brw,
                    struct intel_mipmap_tree *mt,
-                   enum intel_miptree_tiling_mode requested,
                    uint32_t layout_flags);
 
 void *intel_miptree_map_raw(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index 30380570d62..3fe506e3cf1 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -247,7 +247,7 @@ intelReadPixels(struct gl_context * ctx,
           * rendered to via a PBO at any point, so it seems better to just
           * flush here unconditionally.
           */
-         intel_batchbuffer_emit_mi_flush(brw);
+         brw_emit_mi_flush(brw);
          return;
       }
 
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index bd14e189da3..b4283da9633 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -47,6 +47,9 @@
 /* Load a value from memory into a register.  Only available on Gen7+. */
 #define GEN7_MI_LOAD_REGISTER_MEM	(CMD_MI | (0x29 << 23))
 # define MI_LOAD_REGISTER_MEM_USE_GGTT		(1 << 22)
+/* Haswell RS control */
+#define MI_RS_CONTROL                   (CMD_MI | (0x6 << 23))
+#define MI_RS_STORE_DATA_IMM            (CMD_MI | (0x2b << 23))
 
 /* Manipulate the predicate bit based on some register values. Only on Gen7+ */
 #define GEN7_MI_PREDICATE		(CMD_MI | (0xC << 23))
@@ -102,6 +105,8 @@
 
 #define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22))
 
+#define XY_FAST_COPY_BLT_CMD             (CMD_2D | (0x42 << 22))
+
 #define XY_TEXT_IMMEDIATE_BLIT_CMD	(CMD_2D | (0x31 << 22))
 # define XY_TEXT_BYTE_PACKED		(1 << 16)
 
@@ -111,10 +116,24 @@
 #define XY_SRC_TILED		(1 << 15)
 #define XY_DST_TILED		(1 << 11)
 
+/* BR00 */
+#define XY_FAST_SRC_TILED_64K        (3 << 20)
+#define XY_FAST_SRC_TILED_Y          (2 << 20)
+#define XY_FAST_SRC_TILED_X          (1 << 20)
+
+#define XY_FAST_DST_TILED_64K        (3 << 13)
+#define XY_FAST_DST_TILED_Y          (2 << 13)
+#define XY_FAST_DST_TILED_X          (1 << 13)
+
 /* BR13 */
 #define BR13_8			(0x0 << 24)
 #define BR13_565		(0x1 << 24)
 #define BR13_8888		(0x3 << 24)
+#define BR13_16161616		(0x4 << 24)
+#define BR13_32323232		(0x5 << 24)
+
+#define XY_FAST_SRC_TRMODE_YF        (1 << 31)
+#define XY_FAST_DST_TRMODE_YF        (1 << 30)
 
 /* Pipeline Statistics Counter Registers */
 #define IA_VERTICES_COUNT               0x2310
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index de14696bd76..a164c6985dc 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -229,6 +229,12 @@ static struct intel_image_format intel_image_formats[] = {
    { __DRI_IMAGE_FOURCC_RGB565, __DRI_IMAGE_COMPONENTS_RGB, 1,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_RGB565, 2 } } },
 
+   { __DRI_IMAGE_FOURCC_R8, __DRI_IMAGE_COMPONENTS_R, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, } },
+
+   { __DRI_IMAGE_FOURCC_GR88, __DRI_IMAGE_COMPONENTS_RG, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_GR88, 2 }, } },
+
    { __DRI_IMAGE_FOURCC_YUV410, __DRI_IMAGE_COMPONENTS_Y_U_V, 3,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 },
        { 1, 2, 2, __DRI_IMAGE_FORMAT_R8, 1 },
@@ -1123,6 +1129,50 @@ intel_detect_swizzling(struct intel_screen *screen)
       return true;
 }
 
+static int /* 0 = none, 1 = unshifted, 2 = shifted, 3 = TIMESTAMP | 1 read ok */
+intel_detect_timestamp(struct intel_screen *screen)
+{
+   uint64_t dummy = 0, last = 0;
+   int upper, lower, loops;
+
+   /* On 64bit systems, some old kernels trigger a hw bug resulting in the
+    * TIMESTAMP register being shifted and the low 32bits always zero.
+    *
+    * More recent kernels offer an interface to read the full 36bits
+    * everywhere.
+    */
+   if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP | 1, &dummy) == 0)
+      return 3; /* the kernel accepted the TIMESTAMP | 1 read */
+
+   /* Determine if we have a 32bit or 64bit kernel by inspecting the
+    * upper 32bits for a rapidly changing timestamp.
+    */
+   if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &last))
+      return 0; /* register read failed: report no timestamp */
+
+   upper = lower = 0;
+   for (loops = 0; loops < 10; loops++) {
+      /* The TIMESTAMP should change every 80ns, so several round trips
+       * through the kernel should be enough to advance it.
+       */
+      if (drm_intel_reg_read(screen->bufmgr, TIMESTAMP, &dummy))
+         return 0; /* read failed during the probe */
+
+      upper += (dummy >> 32) != (last >> 32); /* upper dword changed? */
+      if (upper > 1) /* beware 32bit counter overflow */
+         return 2; /* upper dword holds the low 32bits of the timestamp */
+
+      lower += (dummy & 0xffffffff) != (last & 0xffffffff); /* lower dword? */
+      if (lower > 1)
+         return 1; /* timestamp is unshifted */
+
+      last = dummy; /* compare the next sample against this one */
+   }
+
+   /* No advancement? No timestamp! */
+   return 0;
+}
+
 /**
  * Return array of MSAA modes supported by the hardware. The array is
  * zero-terminated and sorted in decreasing order.
@@ -1309,11 +1359,6 @@ set_max_gl_versions(struct intel_screen *screen)
    }
 }
 
-/* drop when libdrm 2.4.61 is released */
-#ifndef I915_PARAM_REVISION
-#define I915_PARAM_REVISION 32
-#endif
-
 static int
 brw_get_revision(int fd)
 {
@@ -1332,6 +1377,11 @@ brw_get_revision(int fd)
    return revision;
 }
 
+/* Drop when RS headers get pulled to libdrm */
+#ifndef I915_PARAM_HAS_RESOURCE_STREAMER
+#define I915_PARAM_HAS_RESOURCE_STREAMER 36
+#endif
+
 /**
  * This is the driver specific part of the createNewScreen entry point.
  * Called when using DRI2.
@@ -1378,6 +1428,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    intelScreen->hw_must_use_separate_stencil = intelScreen->devinfo->gen >= 7;
 
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
+   intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
 
    const char *force_msaa = getenv("INTEL_FORCE_MSAA");
    if (force_msaa) {
@@ -1423,6 +1474,15 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    intelScreen->compiler = brw_compiler_create(intelScreen,
                                                intelScreen->devinfo);
 
+   if (intelScreen->devinfo->has_resource_streamer) {
+      int val = -1;
+      getparam.param = I915_PARAM_HAS_RESOURCE_STREAMER;
+      getparam.value = &val;
+
+      drmIoctl(psp->fd, DRM_IOCTL_I915_GETPARAM, &getparam);
+      intelScreen->has_resource_streamer = val > 0;
+   }
+
    return (const __DRIconfig**) intel_screen_make_configs(psp);
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 742b3d30eee..fd5143eecba 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -52,6 +52,13 @@ struct intel_screen
 
    bool hw_has_swizzling;
 
+   int hw_has_timestamp;
+
+   /**
+    * Does the kernel support resource streamer?
+    */
+   bool has_resource_streamer;
+
    /**
     * Does the kernel support context reset notifications?
     */
diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index 3cfa7e593ab..c44c4beceef 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -69,7 +69,7 @@ brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
    assert(!fence->batch_bo);
    assert(!fence->signalled);
 
-   intel_batchbuffer_emit_mi_flush(brw);
+   brw_emit_mi_flush(brw);
    fence->batch_bo = brw->batch.bo;
    drm_intel_bo_reference(fence->batch_bo);
    intel_batchbuffer_flush(brw);
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index b0181ad1d75..e16b0def0d4 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -145,7 +145,7 @@ intel_alloc_texture_storage(struct gl_context *ctx,
                                               0, levels - 1,
                                               width, height, depth,
                                               num_samples,
-                                              INTEL_MIPTREE_TILING_ANY, 0);
+                                              MIPTREE_LAYOUT_TILING_ANY);
 
       if (intel_texobj->mt == NULL) {
          return false;
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index ebe84b664d4..93a8cdee0cb 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -80,8 +80,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
 			       height,
 			       depth,
                                intelImage->base.Base.NumSamples,
-                               INTEL_MIPTREE_TILING_ANY,
-                               layout_flags);
+                               layout_flags | MIPTREE_LAYOUT_TILING_ANY);
 }
 
 static void
@@ -98,8 +97,8 @@ intelTexImage(struct gl_context * ctx,
 
    DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
        __func__, _mesa_get_format_name(texImage->TexFormat),
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
-       _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type),
+       _mesa_enum_to_string(texImage->TexObject->Target),
+       _mesa_enum_to_string(format), _mesa_enum_to_string(type),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
    /* Allocate storage for texture data. */
@@ -472,39 +471,44 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
 }
 
 static void
-intel_get_tex_image(struct gl_context *ctx,
-                    GLenum format, GLenum type, GLvoid *pixels,
-                    struct gl_texture_image *texImage) {
+intel_get_tex_sub_image(struct gl_context *ctx, /* GetTexSubImage driver hook */
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
+                        GLenum format, GLenum type, GLvoid *pixels,
+                        struct gl_texture_image *texImage)
+{
    struct brw_context *brw = brw_context(ctx);
    bool ok;
 
    DBG("%s\n", __func__);
 
    if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-      if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage, 0, 0, 0,
-                                        texImage->Width, texImage->Height,
-                                        texImage->Depth, format, type,
+      if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage,
+                                        xoffset, yoffset, zoffset,
+                                        width, height, depth, format, type,
                                         pixels, &ctx->Pack)) {
          /* Flush to guarantee coherency between the render cache and other
           * caches the PBO could potentially be bound to after this point.
           * See the related comment in intelReadPixels() for a more detailed
           * explanation.
           */
-         intel_batchbuffer_emit_mi_flush(brw);
+         brw_emit_mi_flush(brw);
          return;
       }
 
       perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
    }
 
-   ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, 0, 0,
-                                          texImage->Width, texImage->Height,
+   ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset,
+                                          width, height, /* NOTE(review): zoffset/depth not passed - confirm helper is 2D-only */
                                           format, type, pixels, &ctx->Pack);
 
    if(ok)
       return;
 
-   _mesa_meta_GetTexImage(ctx, format, type, pixels, texImage);
+   _mesa_meta_GetTexSubImage(ctx, xoffset, yoffset, zoffset, /* last-resort path */
+                             width, height, depth,
+                             format, type, pixels, texImage);
 
    DBG("%s - DONE\n", __func__);
 }
@@ -515,5 +519,5 @@ intelInitTextureImageFuncs(struct dd_function_table *functions)
    functions->TexImage = intelTexImage;
    functions->EGLImageTargetTexture2D = intel_image_target_texture_2d;
    functions->BindRenderbufferTexImage = intel_bind_renderbuffer_tex_image;
-   functions->GetTexImage = intel_get_tex_image;
+   functions->GetTexSubImage = intel_get_tex_sub_image;
 }
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 7507f7669a0..31e511f0b7b 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -206,8 +206,8 @@ intelTexSubImage(struct gl_context * ctx,
 
    DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
        __func__, _mesa_get_format_name(texImage->TexFormat),
-       _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
-       _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type),
+       _mesa_enum_to_string(texImage->TexObject->Target),
+       _mesa_enum_to_string(format), _mesa_enum_to_string(type),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
    ok = _mesa_meta_pbo_TexSubImage(ctx, dims, texImage,
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 4991c2997ef..d3fb252b5d5 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -136,6 +136,8 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                  _mesa_get_format_name(firstImage->base.Base.TexFormat),
                  width, height, depth, validate_last_level + 1);
 
+      const uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD |
+                                    MIPTREE_LAYOUT_TILING_ANY;
       intelObj->mt = intel_miptree_create(brw,
                                           intelObj->base.Target,
 					  firstImage->base.Base.TexFormat,
@@ -145,8 +147,7 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                                           height,
                                           depth,
                                           0 /* num_samples */,
-                                          INTEL_MIPTREE_TILING_ANY,
-                                          MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
+                                          layout_flags);
       if (!intelObj->mt)
          return false;
    }
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 8010fb4f610..ba67bc59e19 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -283,10 +283,10 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
    fs_reg zero(0.0f);
-   bld.ADD(offset(dest, 2), src0, src1);
+   bld.ADD(offset(dest, bld, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dest, src2)
       ->regs_written = 4;
-   bld.CMP(bld.null_reg_f(), offset(dest, 2), zero, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
index 3ef0cb319eb..1caa0b50ec6 100644
--- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
@@ -367,10 +367,10 @@ TEST_F(saturate_propagation_test, intervening_dest_write)
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
-   bld.ADD(offset(dst0, 2), src0, src1);
+   bld.ADD(offset(dst0, bld, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dst0, src2)
       ->regs_written = 4;
-   set_saturate(true, bld.MOV(dst1, offset(dst0, 2)));
+   set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2)));
 
    /* = Before =
     *
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index 84e43fa75cd..fbd9fa8f19b 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -53,7 +53,8 @@ public:
    }
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir)
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type)
    {
       unreachable("Not reached");
    }
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index de2afd39cfe..a3055fcc851 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -56,7 +56,8 @@ public:
    }
 
 protected:
-   virtual dst_reg *make_reg_for_system_value(ir_variable *ir)
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type)
    {
       unreachable("Not reached");
    }
diff --git a/src/mesa/drivers/dri/nouveau/Makefile.am b/src/mesa/drivers/dri/nouveau/Makefile.am
index 61af95a7dbc..01e34a8e3c3 100644
--- a/src/mesa/drivers/dri/nouveau/Makefile.am
+++ b/src/mesa/drivers/dri/nouveau/Makefile.am
@@ -38,8 +38,8 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS) \
-	$(NOUVEAU_CFLAGS)
+	$(NVVIEUX_CFLAGS)
 
 noinst_LTLIBRARIES = libnouveau_dri.la
 libnouveau_dri_la_SOURCES = $(NOUVEAU_C_FILES)
-libnouveau_dri_la_LIBADD = $(NOUVEAU_LIBS)
+libnouveau_dri_la_LIBADD = $(NVVIEUX_LIBS)
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
index 0753c3a0019..755de2c4b68 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_swtnl_t.c
@@ -338,7 +338,6 @@ TAG(swtnl_init)(struct gl_context *ctx)
 			   NUM_VERTEX_ATTRS * 4 * sizeof(GLfloat));
 	_tnl_need_projected_coords(ctx, GL_FALSE);
 	_tnl_allow_vertex_fog(ctx, GL_FALSE);
-	_tnl_wakeup(ctx);
 
 	swtnl_alloc_vertices(ctx);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
index c85acec1268..a3fbad07e66 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
@@ -223,6 +223,7 @@ TAG(vbo_render_prims)(struct gl_context *ctx,
 		      GLboolean index_bounds_valid,
 		      GLuint min_index, GLuint max_index,
 		      struct gl_transform_feedback_object *tfb_vertcount,
+                      unsigned stream,
 		      struct gl_buffer_object *indirect);
 
 static GLboolean
@@ -455,6 +456,7 @@ TAG(vbo_render_prims)(struct gl_context *ctx,
 		      GLboolean index_bounds_valid,
 		      GLuint min_index, GLuint max_index,
 		      struct gl_transform_feedback_object *tfb_vertcount,
+                      unsigned stream,
 		      struct gl_buffer_object *indirect)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
@@ -492,6 +494,7 @@ TAG(vbo_check_render_prims)(struct gl_context *ctx,
 			    GLboolean index_bounds_valid,
 			    GLuint min_index, GLuint max_index,
 			    struct gl_transform_feedback_object *tfb_vertcount,
+                            unsigned stream,
 			    struct gl_buffer_object *indirect)
 {
 	struct nouveau_context *nctx = to_nouveau_context(ctx);
@@ -501,12 +504,12 @@ TAG(vbo_check_render_prims)(struct gl_context *ctx,
 	if (nctx->fallback == HWTNL)
 		TAG(vbo_render_prims)(ctx, prims, nr_prims, ib,
 				      index_bounds_valid, min_index, max_index,
-				      tfb_vertcount, indirect);
+				      tfb_vertcount, stream, indirect);
 
 	if (nctx->fallback == SWTNL)
 		_tnl_draw_prims(ctx, prims, nr_prims, ib,
 				index_bounds_valid, min_index, max_index,
-				tfb_vertcount, indirect);
+				tfb_vertcount, stream, indirect);
 }
 
 void
diff --git a/src/mesa/drivers/dri/nouveau/nv04_render.c b/src/mesa/drivers/dri/nouveau/nv04_render.c
index 30e9f9aad96..3b7f7829044 100644
--- a/src/mesa/drivers/dri/nouveau/nv04_render.c
+++ b/src/mesa/drivers/dri/nouveau/nv04_render.c
@@ -285,7 +285,6 @@ nv04_render_init(struct gl_context *ctx)
 	_tnl_init_vertices(ctx, tnl->vb.Size,
 			   NUM_VERTEX_ATTRS * 4 * sizeof(GLfloat));
 	_tnl_allow_pixel_fog(ctx, GL_FALSE);
-	_tnl_wakeup(ctx);
 }
 
 void
diff --git a/src/mesa/drivers/dri/r200/r200_blit.c b/src/mesa/drivers/dri/r200/r200_blit.c
index 3adc69423cd..d68a53e67f7 100644
--- a/src/mesa/drivers/dri/r200/r200_blit.c
+++ b/src/mesa/drivers/dri/r200/r200_blit.c
@@ -28,6 +28,7 @@
 #include "radeon_common.h"
 #include "r200_context.h"
 #include "r200_blit.h"
+#include "r200_tex.h"
 
 static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
                                   int reg, int count)
@@ -40,22 +41,42 @@ static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
 /* common formats supported as both textures and render targets */
 unsigned r200_check_blit(mesa_format mesa_format, uint32_t dst_pitch)
 {
-    /* XXX others?  BE/LE? */
-    switch (mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-    case MESA_FORMAT_B5G6R5_UNORM:
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_L_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
-    /* swizzled */
-    case MESA_FORMAT_A8B8G8R8_UNORM:
-    case MESA_FORMAT_R8G8B8A8_UNORM:
+    /* XXX others? */
+    if (_mesa_little_endian()) {
+	switch (mesa_format) {
+	case MESA_FORMAT_B8G8R8A8_UNORM:
+	case MESA_FORMAT_B8G8R8X8_UNORM:
+	case MESA_FORMAT_B5G6R5_UNORM:
+	case MESA_FORMAT_B4G4R4A4_UNORM:
+	case MESA_FORMAT_B5G5R5A1_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
+	/* swizzled - probably can't happen with the disabled Choose8888TexFormat code */
+	case MESA_FORMAT_A8B8G8R8_UNORM:
+	case MESA_FORMAT_R8G8B8A8_UNORM:
 	    break;
-    default:
+	default:
 	    return 0;
+	}
+    }
+    else {
+	switch (mesa_format) {
+	case MESA_FORMAT_A8R8G8B8_UNORM:
+	case MESA_FORMAT_X8R8G8B8_UNORM:
+	case MESA_FORMAT_R5G6B5_UNORM:
+	case MESA_FORMAT_A4R4G4B4_UNORM:
+	case MESA_FORMAT_A1R5G5B5_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
+	/* swizzled  - probably can't happen with the disabled Choose8888TexFormat code */
+	case MESA_FORMAT_R8G8B8A8_UNORM:
+	case MESA_FORMAT_A8B8G8R8_UNORM:
+	   break;
+	default:
+	   return 0;
+	}
     }
 
     /* Rendering to small buffer doesn't work.
@@ -112,41 +133,11 @@ static void inline emit_tx_setup(struct r200_context *r200,
     assert(height <= 2048);
     assert(offset % 32 == 0);
 
-    /* XXX others?  BE/LE? */
-    switch (src_mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A8B8G8R8_UNORM:
-	    txformat |= R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_R8G8B8A8_UNORM:
-	    txformat |= R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB8888;
-	    break;
-    case MESA_FORMAT_B5G6R5_UNORM:
-	    txformat |= R200_TXFORMAT_RGB565;
-	    break;
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-	    txformat |= R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
-	    txformat |= R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_L_UNORM8:
-	    txformat |= R200_TXFORMAT_I8;
-	    break;
-    case MESA_FORMAT_L8A8_UNORM:
-	    txformat |= R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    default:
-	    break;
+    if (_mesa_little_endian()) {
+	txformat |= tx_table_le[src_mesa_format].format;
+    }
+    else {
+	txformat |= tx_table_be[src_mesa_format].format;
     }
 
     if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
@@ -155,11 +146,19 @@ static void inline emit_tx_setup(struct r200_context *r200,
 	offset |= R200_TXO_MICRO_TILE;
 
     switch (dst_mesa_format) {
+    /* le */
     case MESA_FORMAT_B8G8R8A8_UNORM:
     case MESA_FORMAT_B8G8R8X8_UNORM:
     case MESA_FORMAT_B5G6R5_UNORM:
     case MESA_FORMAT_B4G4R4A4_UNORM:
     case MESA_FORMAT_B5G5R5A1_UNORM:
+    /* be */
+    case MESA_FORMAT_A8R8G8B8_UNORM:
+    case MESA_FORMAT_X8R8G8B8_UNORM:
+    case MESA_FORMAT_R5G6B5_UNORM:
+    case MESA_FORMAT_A4R4G4B4_UNORM:
+    case MESA_FORMAT_A1R5G5B5_UNORM:
+    /* little and big */
     case MESA_FORMAT_A_UNORM8:
     case MESA_FORMAT_L_UNORM8:
     case MESA_FORMAT_I_UNORM8:
@@ -183,6 +182,9 @@ static void inline emit_tx_setup(struct r200_context *r200,
 	    END_BATCH();
 	    break;
     case MESA_FORMAT_A8B8G8R8_UNORM:
+    case MESA_FORMAT_R8G8B8A8_UNORM:
+       if ((dst_mesa_format == MESA_FORMAT_A8B8G8R8_UNORM && _mesa_little_endian()) ||
+	   (dst_mesa_format == MESA_FORMAT_R8G8B8A8_UNORM && !_mesa_little_endian())) {
 	    BEGIN_BATCH(10);
 	    OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE |
 					      RADEON_TEX_BLEND_0_ENABLE));
@@ -190,6 +192,8 @@ static void inline emit_tx_setup(struct r200_context *r200,
 						  R200_TXC_ARG_B_ZERO |
 						  R200_TXC_ARG_C_R0_COLOR |
 						  R200_TXC_OP_MADD));
+	    /* XXX I don't think this can work. This is output rotation, and alpha contains
+	     * red, not alpha (we'd write gbrr). */
 	    OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_0, (R200_TXC_CLAMP_0_1 |
 						   R200_TXC_OUTPUT_ROTATE_GBA |
 						   R200_TXC_OUTPUT_REG_R0));
@@ -201,8 +205,16 @@ static void inline emit_tx_setup(struct r200_context *r200,
 						   (R200_TXA_REPL_RED << R200_TXA_REPL_ARG_C_SHIFT) |
 						   R200_TXA_OUTPUT_REG_R0));
 	    END_BATCH();
-	    break;
-    case MESA_FORMAT_R8G8B8A8_UNORM:
+       }
+       else {
+	    /* XXX Pretty sure this could be done with just 2 instead of 4 instructions.
+	     * Like so:
+	     * 1st: use RGA output rotation, rgb arg replicate b, a arg r, write mask rb.
+	     * That's just one instruction in fact, but I'm not entirely sure it works
+	     * if some of those incoming r0 components are never written (due to mask)
+	     * in the shader itself to r0.
+	     * In any event, this case (and the one above) may not be reachable with
+	     * the disabled Choose8888TexFormat code. */
 	    BEGIN_BATCH(34);
 	    OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE |
 					      RADEON_TEX_BLEND_0_ENABLE |
@@ -272,7 +284,8 @@ static void inline emit_tx_setup(struct r200_context *r200,
 	    OUT_BATCH_REGVAL(R200_PP_TXABLEND2_3, (R200_TXA_CLAMP_0_1 |
 						   R200_TXA_OUTPUT_REG_R0));
 	    END_BATCH();
-	    break;
+	}
+	break;
     }
 
     BEGIN_BATCH(18);
@@ -306,21 +319,27 @@ static inline void emit_cb_setup(struct r200_context *r200,
     uint32_t dst_format = 0;
     BATCH_LOCALS(&r200->radeon);
 
-    /* XXX others?  BE/LE? */
     switch (mesa_format) {
+    /* The first of each pair is for little, the second for big endian */
     case MESA_FORMAT_B8G8R8A8_UNORM:
+    case MESA_FORMAT_A8R8G8B8_UNORM:
     case MESA_FORMAT_B8G8R8X8_UNORM:
+    case MESA_FORMAT_X8R8G8B8_UNORM:
+    /* These two are valid both for little and big endian (swizzled) */
     case MESA_FORMAT_A8B8G8R8_UNORM:
     case MESA_FORMAT_R8G8B8A8_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
 	    break;
     case MESA_FORMAT_B5G6R5_UNORM:
+    case MESA_FORMAT_R5G6B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_RGB565;
 	    break;
     case MESA_FORMAT_B4G4R4A4_UNORM:
+    case MESA_FORMAT_A4R4G4B4_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB4444;
 	    break;
     case MESA_FORMAT_B5G5R5A1_UNORM:
+    case MESA_FORMAT_A1R5G5B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
 	    break;
     case MESA_FORMAT_A_UNORM8:
@@ -547,5 +566,21 @@ unsigned r200_blit(struct gl_context *ctx,
 
     radeonFlush(ctx);
 
+    /* We submitted those packets outside our state atom mechanism. Thus
+     * make sure the atoms are resubmitted the next time. */
+    r200->hw.cst.dirty = GL_TRUE;
+    r200->hw.ctx.dirty = GL_TRUE;
+    r200->hw.vap.dirty = GL_TRUE;
+    r200->hw.msk.dirty = GL_TRUE;
+    r200->hw.pix[0].dirty = GL_TRUE;
+    r200->hw.pix[1].dirty = GL_TRUE;
+    r200->hw.pix[2].dirty = GL_TRUE;
+    r200->hw.pix[3].dirty = GL_TRUE;
+    r200->hw.sci.dirty = GL_TRUE;
+    r200->hw.set.dirty = GL_TRUE;
+    r200->hw.tex[0].dirty = GL_TRUE;
+    r200->hw.vte.dirty = GL_TRUE;
+    r200->hw.vtx.dirty = GL_TRUE;
+
     return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index fb15082114f..2a42ab3f4c8 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -225,18 +225,9 @@ GLboolean r200CreateContext( gl_api api,
    rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
 							"def_max_anisotropy");
 
-   if ( sPriv->drm_version.major == 1
-       && driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
-      if ( sPriv->drm_version.minor < 13 )
-	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
-			  "disabling.\n", sPriv->drm_version.minor );
-      else
-	 rmesa->using_hyperz = GL_TRUE;
-   }
+   if (driQueryOptionb( &rmesa->radeon.optionCache, "hyperz"))
+      rmesa->using_hyperz = GL_TRUE;
  
-   if ( sPriv->drm_version.minor >= 15 )
-      rmesa->texmicrotile = GL_TRUE;
-
    /* Init default driver functions then plug in our R200-specific functions
     * (the texture functions are especially important)
     */
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index eb498f7406b..c02a4f399ee 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -109,7 +109,6 @@ struct r200_texture_state {
 #define CTX_RB3D_COLOROFFSET  11
 #define CTX_CMD_2             12 /* why */
 #define CTX_RB3D_COLORPITCH   13 /* why */
-#define CTX_STATE_SIZE_OLDDRM 14
 #define CTX_CMD_3             14
 #define CTX_RB3D_BLENDCOLOR   15
 #define CTX_RB3D_ABLENDCNTL   16
@@ -167,9 +166,6 @@ struct r200_texture_state {
 #define TEX_PP_TXSIZE               4  /*2c0c*/
 #define TEX_PP_TXPITCH              5  /*2c10*/
 #define TEX_PP_BORDER_COLOR         6  /*2c14*/
-#define TEX_CMD_1_OLDDRM            7
-#define TEX_PP_TXOFFSET_OLDDRM      8  /*2d00 */
-#define TEX_STATE_SIZE_OLDDRM       9
 #define TEX_PP_CUBIC_FACES          7
 #define TEX_PP_TXMULTI_CTL          8
 #define TEX_CMD_1_NEWDRM            9
@@ -618,7 +614,6 @@ struct r200_context {
    struct r200_swtcl_info swtcl;
 
    GLboolean using_hyperz;
-   GLboolean texmicrotile;
 
   struct ati_fragment_shader *afs_loaded;
 };
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 6fe70b5c9d0..cca176d7f9b 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -1546,7 +1546,7 @@ void r200UpdateWindow( struct gl_context *ctx )
    GLfloat xoffset = 0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0;
    const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0);
-   double scale[3], translate[3];
+   float scale[3], translate[3];
    GLfloat y_scale, y_bias;
 
    if (render_to_fbo) {
@@ -1669,7 +1669,7 @@ static void r200Enable( struct gl_context *ctx, GLenum cap, GLboolean state )
 
    if ( R200_DEBUG & RADEON_STATE )
       fprintf( stderr, "%s( %s = %s )\n", __func__,
-	       _mesa_lookup_enum_by_nr( cap ),
+	       _mesa_enum_to_string( cap ),
 	       state ? "GL_TRUE" : "GL_FALSE" );
 
    switch ( cap ) {
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index d9d1a0ed227..ad64f788b9f 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -254,7 +254,7 @@ CHECK( never, GL_FALSE, 0 )
 CHECK( tex_any, ctx->Texture._MaxEnabledTexImageUnit != -1, 0 )
 CHECK( tf, (ctx->Texture._MaxEnabledTexImageUnit != -1 && !ctx->ATIFragmentShader._Enabled), 0 );
 CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled, 0 )
-   CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 )
+CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 )
 CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)), 0 )
 CHECK( afs, ctx->ATIFragmentShader._Enabled, 0 )
 CHECK( tex_cube, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT, 3 + 3*5 - CUBE_STATE_SIZE )
@@ -453,12 +453,15 @@ static void ctx_emit_cs(struct gl_context *ctx, struct radeon_state_atom *atom)
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
    else switch (rrb->base.Base.Format) {
    case MESA_FORMAT_B5G6R5_UNORM:
+   case MESA_FORMAT_R5G6B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
 	break;
    case MESA_FORMAT_B4G4R4A4_UNORM:
+   case MESA_FORMAT_A4R4G4B4_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
 	break;
    case MESA_FORMAT_B5G5R5A1_UNORM:
+   case MESA_FORMAT_A1R5G5B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
 	break;
    default:
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index 083a1840d9e..feee0b2ba3f 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -68,9 +68,9 @@ static void r200SetTexWrap( radeonTexObjPtr t, GLenum swrap, GLenum twrap, GLenu
    radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 		"%s(tex %p) sw %s, tw %s, rw %s\n",
 		__func__, t,
-		_mesa_lookup_enum_by_nr(swrap),
-		_mesa_lookup_enum_by_nr(twrap),
-		_mesa_lookup_enum_by_nr(rwrap));
+		_mesa_enum_to_string(swrap),
+		_mesa_enum_to_string(twrap),
+		_mesa_enum_to_string(rwrap));
 
    t->pp_txfilter &= ~(R200_CLAMP_S_MASK | R200_CLAMP_T_MASK | R200_BORDER_MODE_D3D);
 
@@ -225,8 +225,8 @@ static void r200SetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
    radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 	"%s(tex %p) minf %s, maxf %s, anisotropy %d.\n",
 	__func__, t,
-	_mesa_lookup_enum_by_nr(minf),
-	_mesa_lookup_enum_by_nr(magf),
+	_mesa_enum_to_string(minf),
+	_mesa_enum_to_string(magf),
 	anisotropy);
 
    if ( anisotropy == R200_MAX_ANISO_1_TO_1 ) {
@@ -302,7 +302,7 @@ static void r200TexEnv( struct gl_context *ctx, GLenum target,
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 
    radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_VERBOSE, "%s( %s )\n",
-	       __func__, _mesa_lookup_enum_by_nr( pname ) );
+	       __func__, _mesa_enum_to_string( pname ) );
 
    /* This is incorrect: Need to maintain this data for each of
     * GL_TEXTURE_{123}D, GL_TEXTURE_RECTANGLE_NV, etc, and switch
@@ -384,7 +384,7 @@ static void r200TexParameter( struct gl_context *ctx,
    radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_VERBOSE,
 		"%s(%p, tex %p)  pname %s\n",
 		__func__, ctx, texObj,
-	       _mesa_lookup_enum_by_nr( pname ) );
+	       _mesa_enum_to_string( pname ) );
 
    switch ( pname ) {
    case GL_TEXTURE_MIN_FILTER:
@@ -415,7 +415,7 @@ static void r200DeleteTexture(struct gl_context * ctx, struct gl_texture_object
    radeon_print(RADEON_TEXTURE | RADEON_STATE, RADEON_NORMAL,
            "%s( %p (target = %s) )\n", __func__,
 	   (void *)texObj,
-	   _mesa_lookup_enum_by_nr(texObj->Target));
+	   _mesa_enum_to_string(texObj->Target));
 
    if (rmesa) {
       int i;
@@ -473,7 +473,7 @@ static struct gl_texture_object *r200NewTextureObject(struct gl_context * ctx,
    radeon_print(RADEON_STATE | RADEON_TEXTURE, RADEON_NORMAL,
            "%s(%p) target %s, new texture %p.\n",
 	   __func__, ctx,
-	   _mesa_lookup_enum_by_nr(target), t);
+	   _mesa_enum_to_string(target), t);
 
    _mesa_initialize_texture_object(ctx, &t->base, name, target);
    t->base.Sampler.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h
index d7e91d1a0c8..a8c31b741ed 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.h
+++ b/src/mesa/drivers/dri/r200/r200_tex.h
@@ -52,4 +52,68 @@ extern void r200TexUpdateParameters(struct gl_context *ctx, GLuint unit);
 
 extern void set_re_cntl_d3d( struct gl_context *ctx, int unit, GLboolean use_d3d );
 
+struct tx_table { /* element type of the tx_table_be / tx_table_le lookup tables */
+   GLuint format, filter; /* format: R200_TXFORMAT_* bits (0xffffffff in unsupported slots); filter: 0 or R200_YUV_TO_RGB */
+};
+
+/* Indexed by mesa_format: unlisted slots are zero-filled, 0xffffffff marks an unsupported format.
+ * Note the tables (have to) contain entries that are only valid for the other endianness. */
+static const struct tx_table tx_table_be[] =
+{
+   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_X8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888, 0 }, /* accepted by r200_check_blit on BE; X is not alpha */
+   [ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 },
+   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_DXT1, 0 },
+   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_DXT23 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_DXT45 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+};
+
+static const struct tx_table tx_table_le[] =
+{
+   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8X8_UNORM ] = { R200_TXFORMAT_ARGB8888, 0 }, /* accepted by r200_check_blit on LE; X is not alpha */
+   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB8888, 0 },
+   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_I8, 0 },
+   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YVYU422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_VYUY422, R200_YUV_TO_RGB },
+   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_DXT1, 0 },
+   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_DXT23 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_DXT45 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+};
+
 #endif /* __R200_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index ab84d1752ba..441ac730d4c 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -49,80 +49,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_tex.h"
 #include "r200_tcl.h"
 
-
-#define R200_TXFORMAT_A8        R200_TXFORMAT_I8
-#define R200_TXFORMAT_L8        R200_TXFORMAT_I8
-#define R200_TXFORMAT_AL88      R200_TXFORMAT_AI88
-#define R200_TXFORMAT_YCBCR     R200_TXFORMAT_YVYU422
-#define R200_TXFORMAT_YCBCR_REV R200_TXFORMAT_VYUY422
-#define R200_TXFORMAT_RGB_DXT1  R200_TXFORMAT_DXT1
-#define R200_TXFORMAT_RGBA_DXT1 R200_TXFORMAT_DXT1
-#define R200_TXFORMAT_RGBA_DXT3 R200_TXFORMAT_DXT23
-#define R200_TXFORMAT_RGBA_DXT5 R200_TXFORMAT_DXT45
-
 #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
                              && (tx_table_be[f].format != 0xffffffff) )
 
-struct tx_table {
-   GLuint format, filter;
-};
-
-static const struct tx_table tx_table_be[] =
-{
-   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_A8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_L8, 0 },
-   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YCBCR, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_YCBCR_REV, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_RGB_DXT1, 0 },
-   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_RGBA_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_RGBA_DXT3 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_RGBA_DXT5 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-};
-
-static const struct tx_table tx_table_le[] =
-{
-   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB8888, 0 },
-   [ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L8A8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8L8_UNORM ] = { R200_TXFORMAT_AL88 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A_UNORM8 ] = { R200_TXFORMAT_A8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L_UNORM8 ] = { R200_TXFORMAT_L8, 0 },
-   [ MESA_FORMAT_I_UNORM8 ] = { R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_YCBCR ] = { R200_TXFORMAT_YCBCR, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_YCBCR_REV ] = { R200_TXFORMAT_YCBCR_REV, R200_YUV_TO_RGB },
-   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGB_DXT1 ] = { R200_TXFORMAT_RGB_DXT1, 0 },
-   [ MESA_FORMAT_RGBA_DXT1 ] = { R200_TXFORMAT_RGBA_DXT1 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT3 ] = { R200_TXFORMAT_RGBA_DXT3 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT5 ] = { R200_TXFORMAT_RGBA_DXT5 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
-};
-
 /* ================================================================
  * Texture combine functions
  */
diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.c b/src/mesa/drivers/dri/radeon/radeon_blit.c
index 0de17514e05..0b0f06f0edb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_blit.c
+++ b/src/mesa/drivers/dri/radeon/radeon_blit.c
@@ -28,6 +28,7 @@
 #include "radeon_common.h"
 #include "radeon_context.h"
 #include "radeon_blit.h"
+#include "radeon_tex.h"
 
 static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
                                   int reg, int count)
@@ -40,19 +41,36 @@ static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
 /* common formats supported as both textures and render targets */
 unsigned r100_check_blit(mesa_format mesa_format, uint32_t dst_pitch)
 {
-    /* XXX others?  BE/LE? */
-    switch (mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-    case MESA_FORMAT_B5G6R5_UNORM:
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_L_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
+    /* XXX others?  */
+    if (_mesa_little_endian()) {
+	switch (mesa_format) {
+	case MESA_FORMAT_B8G8R8A8_UNORM:
+	case MESA_FORMAT_B8G8R8X8_UNORM:
+	case MESA_FORMAT_B5G6R5_UNORM:
+	case MESA_FORMAT_B4G4R4A4_UNORM:
+	case MESA_FORMAT_B5G5R5A1_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
 	    break;
-    default:
+	default:
 	    return 0;
+	}
+    }
+    else {
+	switch (mesa_format) {
+	case MESA_FORMAT_A8R8G8B8_UNORM:
+	case MESA_FORMAT_X8R8G8B8_UNORM:
+	case MESA_FORMAT_R5G6B5_UNORM:
+	case MESA_FORMAT_A4R4G4B4_UNORM:
+	case MESA_FORMAT_A1R5G5B5_UNORM:
+	case MESA_FORMAT_A_UNORM8:
+	case MESA_FORMAT_L_UNORM8:
+	case MESA_FORMAT_I_UNORM8:
+	    break;
+	default:
+	    return 0;
+	}
     }
 
     /* Rendering to small buffer doesn't work.
@@ -106,40 +124,8 @@ static void inline emit_tx_setup(struct r100_context *r100,
     assert(height <= 2048);
     assert(offset % 32 == 0);
 
-    /* XXX others?  BE/LE? */
-    switch (mesa_format) {
-    case MESA_FORMAT_B8G8R8A8_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A8B8G8R8_UNORM:
-            txformat |= RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-            break;
-    case MESA_FORMAT_B8G8R8X8_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB8888;
-	    break;
-    case MESA_FORMAT_B5G6R5_UNORM:
-	    txformat |= RADEON_TXFORMAT_RGB565;
-	    break;
-    case MESA_FORMAT_B4G4R4A4_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_B5G5R5A1_UNORM:
-	    txformat |= RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_A_UNORM8:
-    case MESA_FORMAT_I_UNORM8:
-	    txformat |= RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-	    break;
-    case MESA_FORMAT_L_UNORM8:
-            txformat |= RADEON_TXFORMAT_I8;
-            break;
-    case MESA_FORMAT_L8A8_UNORM:
-            txformat |= RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-            break;
-    default:
-	    break;
-    }
-    
+    txformat |= tx_table[mesa_format].format;
+
     if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
        offset |= RADEON_TXO_MACRO_TILE;
     if (bo->flags & RADEON_BO_FLAGS_MICRO_TILE)
@@ -184,19 +170,25 @@ static inline void emit_cb_setup(struct r100_context *r100,
     uint32_t dst_format = 0;
     BATCH_LOCALS(&r100->radeon);
 
-    /* XXX others?  BE/LE? */
+    /* XXX others? */
     switch (mesa_format) {
+    /* The first of each pair is for little, the second for big endian. */
     case MESA_FORMAT_B8G8R8A8_UNORM:
+    case MESA_FORMAT_A8R8G8B8_UNORM:
     case MESA_FORMAT_B8G8R8X8_UNORM:
+    case MESA_FORMAT_X8R8G8B8_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
 	    break;
     case MESA_FORMAT_B5G6R5_UNORM:
+    case MESA_FORMAT_R5G6B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_RGB565;
 	    break;
     case MESA_FORMAT_B4G4R4A4_UNORM:
+    case MESA_FORMAT_A4R4G4B4_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB4444;
 	    break;
     case MESA_FORMAT_B5G5R5A1_UNORM:
+    case MESA_FORMAT_A1R5G5B5_UNORM:
 	    dst_format = RADEON_COLOR_FORMAT_ARGB1555;
 	    break;
     case MESA_FORMAT_A_UNORM8:
@@ -425,5 +417,13 @@ unsigned r100_blit(struct gl_context *ctx,
 
     radeonFlush(ctx);
 
+    /* We submitted those packets outside our state atom mechanism. Thus
+     * make sure they are all resubmitted the next time. */
+    r100->hw.ctx.dirty = GL_TRUE;
+    r100->hw.msk.dirty = GL_TRUE;
+    r100->hw.set.dirty = GL_TRUE;
+    r100->hw.tex[0].dirty = GL_TRUE;
+    r100->hw.txr[0].dirty = GL_TRUE;
+
     return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 2a8bd6c9edc..fde89214ed2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -164,7 +164,7 @@ uint32_t radeonGetAge(radeonContextPtr radeon)
 
 	gp.param = RADEON_PARAM_LAST_CLEAR;
 	gp.value = (int *)&age;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+	ret = drmCommandWriteRead(radeon->radeonScreen->driScreen->fd, DRM_RADEON_GETPARAM,
 				  &gp, sizeof(gp));
 	if (ret) {
 		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __func__,
@@ -343,7 +343,7 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode )
 {
 	if (RADEON_DEBUG & RADEON_DRI)
 		fprintf(stderr, "%s %s\n", __func__,
-			_mesa_lookup_enum_by_nr( mode ));
+			_mesa_enum_to_string( mode ));
 
 	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 		radeonContextPtr radeon = RADEON_CONTEXT(ctx);
@@ -358,8 +358,8 @@ void radeonDrawBuffer( struct gl_context *ctx, GLenum mode )
        * that the front-buffer has actually been allocated.
        */
 		if (!was_front_buffer_rendering && radeon->is_front_buffer_rendering) {
-			radeon_update_renderbuffers(radeon->dri.context,
-				radeon->dri.context->driDrawablePriv, GL_FALSE);
+			radeon_update_renderbuffers(radeon->driContext,
+				radeon->driContext->driDrawablePriv, GL_FALSE);
       }
 	}
 
@@ -375,8 +375,8 @@ void radeonReadBuffer( struct gl_context *ctx, GLenum mode )
 					|| (mode == GL_FRONT);
 
 		if (!was_front_buffer_reading && rmesa->is_front_buffer_reading) {
-			radeon_update_renderbuffers(rmesa->dri.context,
-						    rmesa->dri.context->driReadablePriv, GL_FALSE);
+			radeon_update_renderbuffers(rmesa->driContext,
+						    rmesa->driContext->driReadablePriv, GL_FALSE);
 	 	}
 	}
 	/* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
@@ -399,7 +399,7 @@ void radeon_window_moved(radeonContextPtr radeon)
 void radeon_viewport(struct gl_context *ctx)
 {
 	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	__DRIcontext *driContext = radeon->dri.context;
+	__DRIcontext *driContext = radeon->driContext;
 	void (*old_viewport)(struct gl_context *ctx);
 
 	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
@@ -693,6 +693,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa)
 {
 	GLuint size;
 	struct drm_radeon_gem_info mminfo = { 0 };
+	int fd = rmesa->radeonScreen->driScreen->fd;
 
 	/* Initialize command buffer */
 	size = 256 * driQueryOptioni(&rmesa->optionCache,
@@ -711,8 +712,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa)
 			"Allocating %d bytes command buffer (max state is %d bytes)\n",
 			size * 4, rmesa->hw.max_state_size * 4);
 
-	rmesa->cmdbuf.csm =
-		radeon_cs_manager_gem_ctor(rmesa->radeonScreen->driScreen->fd);
+	rmesa->cmdbuf.csm = radeon_cs_manager_gem_ctor(fd);
 	if (rmesa->cmdbuf.csm == NULL) {
 		/* FIXME: fatal error */
 		return;
@@ -725,7 +725,7 @@ void rcommonInitCmdBuf(radeonContextPtr rmesa)
 				  (void (*)(void *))rmesa->glCtx.Driver.Flush, &rmesa->glCtx);
 
 
-	if (!drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO,
+	if (!drmCommandWriteRead(fd, DRM_RADEON_GEM_INFO,
 				 &mminfo, sizeof(mminfo))) {
 		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM,
 				    mminfo.vram_visible);
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 9699dcbfcdc..4660d98c9a2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -162,10 +162,7 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 	_mesa_meta_init(ctx);
 
 	/* DRI fields */
-	radeon->dri.context = driContextPriv;
-	radeon->dri.screen = sPriv;
-	radeon->dri.fd = sPriv->fd;
-	radeon->dri.drmMinor = sPriv->drm_version.minor;
+	radeon->driContext = driContextPriv;
 
 	/* Setup IRQs */
 	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
@@ -194,6 +191,29 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 
 	radeon_init_dma(radeon);
 
+        /* _mesa_initialize_context calls _mesa_init_queryobj which
+         * initializes all of the counter sizes to 64.  The counters on r100
+         * and r200 are only 32-bits for occlusion queries.  Those are the
+         * only counters, so set the other sizes to zero.
+         */
+        radeon->glCtx.Const.QueryCounterBits.SamplesPassed = 32;
+
+        radeon->glCtx.Const.QueryCounterBits.TimeElapsed = 0;
+        radeon->glCtx.Const.QueryCounterBits.Timestamp = 0;
+        radeon->glCtx.Const.QueryCounterBits.PrimitivesGenerated = 0;
+        radeon->glCtx.Const.QueryCounterBits.PrimitivesWritten = 0;
+        radeon->glCtx.Const.QueryCounterBits.VerticesSubmitted = 0;
+        radeon->glCtx.Const.QueryCounterBits.PrimitivesSubmitted = 0;
+        radeon->glCtx.Const.QueryCounterBits.VsInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.TessPatches = 0;
+        radeon->glCtx.Const.QueryCounterBits.TessInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.GsInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.GsPrimitives = 0;
+        radeon->glCtx.Const.QueryCounterBits.FsInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.ComputeInvocations = 0;
+        radeon->glCtx.Const.QueryCounterBits.ClInPrimitives = 0;
+        radeon->glCtx.Const.QueryCounterBits.ClOutPrimitives = 0;
+
 	return GL_TRUE;
 }
 
@@ -302,7 +322,7 @@ radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
  */
 void radeon_prepare_render(radeonContextPtr radeon)
 {
-    __DRIcontext *driContext = radeon->dri.context;
+    __DRIcontext *driContext = radeon->driContext;
     __DRIdrawable *drawable;
     __DRIscreen *screen;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index dc72592b90c..d142a871b40 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -342,17 +342,6 @@ struct radeon_store {
 	int elts_start;
 };
 
-struct radeon_dri_mirror {
-	__DRIcontext *context;	/* DRI context */
-	__DRIscreen *screen;	/* DRI screen */
-
-	drm_context_t hwContext;
-	drm_hw_lock_t *hwLock;
-	int hwLockCount;
-	int fd;
-	int drmMinor;
-};
-
 typedef void (*radeon_tri_func) (radeonContextPtr,
 				 radeonVertex *,
 				 radeonVertex *, radeonVertex *);
@@ -385,6 +374,7 @@ struct radeon_cmdbuf {
 
 struct radeon_context {
    struct gl_context glCtx;             /**< base class, must be first */
+   __DRIcontext *driContext;               /* DRI context */
    radeonScreenPtr radeonScreen;	/* Screen private DRI data */
 
    /* Texture object bookkeeping
@@ -407,9 +397,6 @@ struct radeon_context {
    /* Drawable information */
    unsigned int lastStamp;
 
-   /* Mirrors of some DRI state */
-   struct radeon_dri_mirror dri;
-
    /* Busy waiting */
    GLuint do_usleeps;
    GLuint do_irqs;
@@ -502,12 +489,12 @@ static inline radeonContextPtr RADEON_CONTEXT(struct gl_context *ctx)
 
 static inline __DRIdrawable* radeon_get_drawable(radeonContextPtr radeon)
 {
-	return radeon->dri.context->driDrawablePriv;
+	return radeon->driContext->driDrawablePriv;
 }
 
 static inline __DRIdrawable* radeon_get_readable(radeonContextPtr radeon)
 {
-	return radeon->dri.context->driReadablePriv;
+	return radeon->driContext->driReadablePriv;
 }
 
 extern const char const *radeonVendorString;
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index d4d19354b6d..a9e2ab563d3 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -191,16 +191,8 @@ r100CreateContext( gl_api api,
    rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
                                                  "def_max_anisotropy");
 
-   if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
-      if ( sPriv->drm_version.minor < 13 )
-	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
-			  "disabling.\n", sPriv->drm_version.minor );
-      else
-	 rmesa->using_hyperz = GL_TRUE;
-   }
-
-   if ( sPriv->drm_version.minor >= 15 )
-      rmesa->texmicrotile = GL_TRUE;
+   if (driQueryOptionb(&rmesa->radeon.optionCache, "hyperz"))
+      rmesa->using_hyperz = GL_TRUE;
 
    /* Init default driver functions then plug in our Radeon-specific functions
     * (the texture functions are especially important)
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index 40325327813..badabd9508c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -426,7 +426,6 @@ struct r100_context {
 	struct r100_swtcl_info swtcl;
 
 	GLboolean using_hyperz;
-	GLboolean texmicrotile;
 
 	/* Performance counters
 	 */
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index ef62d097bae..5eece518c95 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -169,6 +169,7 @@ radeon_map_renderbuffer_s8z24(struct gl_context *ctx,
     rrb->map_buffer = malloc(w * h * 4);
     ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT));
     assert(!ret);
+    (void) ret;
     untiled_s8z24_map = rrb->map_buffer;
     tiled_s8z24_map = rrb->bo->ptr;
 
@@ -207,6 +208,7 @@ radeon_map_renderbuffer_z16(struct gl_context *ctx,
     rrb->map_buffer = malloc(w * h * 2);
     ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT));
     assert(!ret);
+    (void) ret;
 
     untiled_z16_map = rrb->map_buffer;
     tiled_z16_map = rrb->bo->ptr;
@@ -324,6 +326,7 @@ radeon_map_renderbuffer(struct gl_context *ctx,
 
    ret = radeon_bo_map(rrb->bo, !!(mode & GL_MAP_WRITE_BIT));
    assert(!ret);
+   (void) ret;
 
    map = rrb->bo->ptr;
    stride = rrb->map_pitch;
@@ -416,7 +419,6 @@ radeon_unmap_renderbuffer(struct gl_context *ctx,
 {
    struct radeon_context *const rmesa = RADEON_CONTEXT(ctx);
    struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
-   GLboolean ok;
 
    if ((rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_DEPTH_ALWAYS_TILED) && !rrb->has_surface) {
        if (rb->Format == MESA_FORMAT_Z24_UNORM_S8_UINT || rb->Format == MESA_FORMAT_Z24_UNORM_X8_UINT) {
@@ -438,6 +440,7 @@ radeon_unmap_renderbuffer(struct gl_context *ctx,
    radeon_bo_unmap(rrb->map_bo);
 
    if (rrb->map_mode & GL_MAP_WRITE_BIT) {
+      GLboolean ok;
       ok = rmesa->vtbl.blit(ctx, rrb->map_bo, 0,
 			    rb->Format, rrb->map_pitch / rrb->cpp,
 			    rrb->map_w, rrb->map_h,
@@ -449,6 +452,7 @@ radeon_unmap_renderbuffer(struct gl_context *ctx,
 			    rrb->map_w, rrb->map_h,
 			    GL_FALSE);
       assert(ok);
+      (void) ok;
    }
 
    radeon_bo_unref(rrb->map_bo);
@@ -700,7 +704,7 @@ radeon_bind_framebuffer(struct gl_context * ctx, GLenum target,
   radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 		"%s(%p, fb %p, target %s) \n",
 		__func__, ctx, fb,
-		_mesa_lookup_enum_by_nr(target));
+		_mesa_enum_to_string(target));
 
    if (target == GL_FRAMEBUFFER_EXT || target == GL_DRAW_FRAMEBUFFER_EXT) {
       radeon_draw_buffer(ctx, fb);
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index 28591cad895..c71766d0a3e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -276,7 +276,7 @@ static void calculate_min_max_lod(struct gl_sampler_object *samp, struct gl_text
 	radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 			"%s(%p) target %s, min %d, max %d.\n",
 			__func__, tObj,
-			_mesa_lookup_enum_by_nr(tObj->Target),
+			_mesa_enum_to_string(tObj->Target),
 			minLod, maxLod);
 
 	/* save these values */
diff --git a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
index 6998444fb66..e115b749da5 100644
--- a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
+++ b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
@@ -212,7 +212,7 @@ radeonReadPixels(struct gl_context * ctx,
      */
     radeon_print(RADEON_FALLBACKS, RADEON_NORMAL,
                  "Falling back to sw for ReadPixels (format %s, type %s)\n",
-                 _mesa_lookup_enum_by_nr(format), _mesa_lookup_enum_by_nr(type));
+                 _mesa_enum_to_string(format), _mesa_enum_to_string(type));
 
     if (ctx->NewState)
         _mesa_update_state(ctx);
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 45d9b2b8c0b..98b4741b456 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -135,36 +135,26 @@ DRI_CONF_END
 static int
 radeonGetParam(__DRIscreen *sPriv, int param, void *value)
 {
-  int ret;
-  drm_radeon_getparam_t gp = { 0 };
   struct drm_radeon_info info = { 0 };
 
-  if (sPriv->drm_version.major >= 2) {
-      info.value = (uint64_t)(uintptr_t)value;
-      switch (param) {
-      case RADEON_PARAM_DEVICE_ID:
-          info.request = RADEON_INFO_DEVICE_ID;
-          break;
-      case RADEON_PARAM_NUM_GB_PIPES:
-          info.request = RADEON_INFO_NUM_GB_PIPES;
-          break;
-      case RADEON_PARAM_NUM_Z_PIPES:
-          info.request = RADEON_INFO_NUM_Z_PIPES;
-          break;
-      case RADEON_INFO_TILE_CONFIG:
-	  info.request = RADEON_INFO_TILE_CONFIG;
-          break;
-      default:
-          return -EINVAL;
-      }
-      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info));
-  } else {
-      gp.param = param;
-      gp.value = value;
-
-      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp));
+  info.value = (uint64_t)(uintptr_t)value;
+  switch (param) {
+  case RADEON_PARAM_DEVICE_ID:
+    info.request = RADEON_INFO_DEVICE_ID;
+    break;
+  case RADEON_PARAM_NUM_GB_PIPES:
+    info.request = RADEON_INFO_NUM_GB_PIPES;
+    break;
+  case RADEON_PARAM_NUM_Z_PIPES:
+    info.request = RADEON_INFO_NUM_Z_PIPES;
+    break;
+  case RADEON_INFO_TILE_CONFIG:
+    info.request = RADEON_INFO_TILE_CONFIG;
+    break;
+  default:
+    return -EINVAL;
   }
-  return ret;
+  return drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info));
 }
 
 #if defined(RADEON_R100)
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index cba3d9c9689..74c1fc6c902 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -1354,7 +1354,7 @@ void radeonUpdateWindow( struct gl_context *ctx )
    GLfloat xoffset = 0.0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->h : 0;
    const GLboolean render_to_fbo = (ctx->DrawBuffer ? _mesa_is_user_fbo(ctx->DrawBuffer) : 0);
-   double scale[3], translate[3];
+   float scale[3], translate[3];
    GLfloat y_scale, y_bias;
 
    if (render_to_fbo) {
@@ -1452,7 +1452,7 @@ static void radeonEnable( struct gl_context *ctx, GLenum cap, GLboolean state )
 
    if ( RADEON_DEBUG & RADEON_STATE )
       fprintf( stderr, "%s( %s = %s )\n", __func__,
-	       _mesa_lookup_enum_by_nr( cap ),
+	       _mesa_enum_to_string( cap ),
 	       state ? "GL_TRUE" : "GL_FALSE" );
 
    switch ( cap ) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index c800edfc7be..5e2f41fdb4a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -336,12 +336,15 @@ static void ctx_emit_cs(struct gl_context *ctx, struct radeon_state_atom *atom)
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
    else switch (rrb->base.Base.Format) {
    case MESA_FORMAT_B5G6R5_UNORM:
+   case MESA_FORMAT_R5G6B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
 	break;
    case MESA_FORMAT_B4G4R4A4_UNORM:
+   case MESA_FORMAT_A4R4G4B4_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
 	break;
    case MESA_FORMAT_B5G5R5A1_UNORM:
+   case MESA_FORMAT_A1R5G5B5_UNORM:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
 	break;
    default:
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index 8a1fbab39f8..2fbd353297b 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -442,7 +442,7 @@ static GLboolean radeon_run_render( struct gl_context *ctx,
 
       radeon_print(RADEON_SWRENDER, RADEON_NORMAL,
 	  "radeon_render.c: prim %s %d..%d\n",
-		 _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
+		 _mesa_enum_to_string(prim & PRIM_MODE_MASK), 
 		 start, start+length);
 
       if (length)
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index 353fdb00ec8..0955a135de8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -263,7 +263,7 @@ static void radeonTexEnv( struct gl_context *ctx, GLenum target,
 
    if ( RADEON_DEBUG & RADEON_STATE ) {
       fprintf( stderr, "%s( %s )\n",
-	       __func__, _mesa_lookup_enum_by_nr( pname ) );
+	       __func__, _mesa_enum_to_string( pname ) );
    }
 
    switch ( pname ) {
@@ -335,7 +335,7 @@ static void radeonTexParameter( struct gl_context *ctx,
    radeonTexObj* t = radeon_tex_obj(texObj);
 
    radeon_print(RADEON_TEXTURE, RADEON_VERBOSE, "%s( %s )\n", __func__,
-	       _mesa_lookup_enum_by_nr( pname ) );
+	       _mesa_enum_to_string( pname ) );
 
    switch ( pname ) {
    case GL_TEXTURE_BASE_LEVEL:
@@ -359,7 +359,7 @@ static void radeonDeleteTexture( struct gl_context *ctx,
 
    radeon_print(RADEON_TEXTURE, RADEON_NORMAL,
 	 "%s( %p (target = %s) )\n", __func__, (void *)texObj,
-	       _mesa_lookup_enum_by_nr( texObj->Target ) );
+	       _mesa_enum_to_string( texObj->Target ) );
 
    if ( rmesa ) {
      radeon_firevertices(&rmesa->radeon);
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h
index fa57c08987d..f8ec432755a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
@@ -51,4 +51,39 @@ extern void radeonTexUpdateParameters(struct gl_context *ctx, GLuint unit);
 
 extern void radeonInitTextureFuncs( radeonContextPtr radeon, struct dd_function_table *functions );
 
+struct tx_table {
+   GLuint format, filter;
+};
+
+/* XXX verify this table against MESA_FORMAT_x values */
+static const struct tx_table tx_table[] =
+{
+   [ MESA_FORMAT_NONE ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB8888, 0 },
+   [ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
+   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L8A8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A8L8_UNORM ] = { RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_A_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_L_UNORM8 ] = { RADEON_TXFORMAT_I8, 0 },
+   [ MESA_FORMAT_I_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_YCBCR ] = { RADEON_TXFORMAT_YVYU422, RADEON_YUV_TO_RGB },
+   [ MESA_FORMAT_YCBCR_REV ] = { RADEON_TXFORMAT_VYUY422, RADEON_YUV_TO_RGB },
+   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
+   [ MESA_FORMAT_RGB_DXT1 ] = { RADEON_TXFORMAT_DXT1, 0 },
+   [ MESA_FORMAT_RGBA_DXT1 ] = { RADEON_TXFORMAT_DXT1 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT3 ] = { RADEON_TXFORMAT_DXT23 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_RGBA_DXT5 ] = { RADEON_TXFORMAT_DXT45 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+};
+
+
 #endif /* __RADEON_TEX_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index 45667efb65f..ec835f248eb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -53,53 +53,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_tcl.h"
 
 
-#define RADEON_TXFORMAT_A8        RADEON_TXFORMAT_I8
-#define RADEON_TXFORMAT_L8        RADEON_TXFORMAT_I8
-#define RADEON_TXFORMAT_AL88      RADEON_TXFORMAT_AI88
-#define RADEON_TXFORMAT_YCBCR     RADEON_TXFORMAT_YVYU422
-#define RADEON_TXFORMAT_YCBCR_REV RADEON_TXFORMAT_VYUY422
-#define RADEON_TXFORMAT_RGB_DXT1  RADEON_TXFORMAT_DXT1
-#define RADEON_TXFORMAT_RGBA_DXT1 RADEON_TXFORMAT_DXT1
-#define RADEON_TXFORMAT_RGBA_DXT3 RADEON_TXFORMAT_DXT23
-#define RADEON_TXFORMAT_RGBA_DXT5 RADEON_TXFORMAT_DXT45
-
 #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
 			     && (tx_table[f].format != 0xffffffff) )
 
-struct tx_table {
-   GLuint format, filter;
-};
-
-/* XXX verify this table against MESA_FORMAT_x values */
-static const struct tx_table tx_table[] =
-{
-   [ MESA_FORMAT_NONE ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_A8B8G8R8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB8888, 0 },
-   [ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
-   [ MESA_FORMAT_B4G4R4A4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A4R4G4B4_UNORM ] = { RADEON_TXFORMAT_ARGB4444 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_B5G5R5A1_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A1R5G5B5_UNORM ] = { RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L8A8_UNORM ] = { RADEON_TXFORMAT_AL88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A8L8_UNORM ] = { RADEON_TXFORMAT_AL88 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_A_UNORM8 ] = { RADEON_TXFORMAT_A8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_L_UNORM8 ] = { RADEON_TXFORMAT_L8, 0 },
-   [ MESA_FORMAT_I_UNORM8 ] = { RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_YCBCR ] = { RADEON_TXFORMAT_YCBCR, RADEON_YUV_TO_RGB },
-   [ MESA_FORMAT_YCBCR_REV ] = { RADEON_TXFORMAT_YCBCR_REV, RADEON_YUV_TO_RGB },
-   [ MESA_FORMAT_RGB_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGBA_FXT1 ] = { 0xffffffff, 0 },
-   [ MESA_FORMAT_RGB_DXT1 ] = { RADEON_TXFORMAT_RGB_DXT1, 0 },
-   [ MESA_FORMAT_RGBA_DXT1 ] = { RADEON_TXFORMAT_RGBA_DXT1 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT3 ] = { RADEON_TXFORMAT_RGBA_DXT3 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-   [ MESA_FORMAT_RGBA_DXT5 ] = { RADEON_TXFORMAT_RGBA_DXT5 | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
-};
-
 /* ================================================================
  * Texture combine functions
  */
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index edfd48b283b..4794ddae069 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -224,7 +224,19 @@ static mesa_format radeonChoose8888TexFormat(radeonContextPtr rmesa,
 	const GLuint ui = 1;
 	const GLubyte littleEndian = *((const GLubyte *)&ui);
 
-	if (fbo)
+
+	/* Unfortunately, regardless of the fbo flag, we might still be asked to
+	 * attach a texture to a fbo later, which then won't succeed if we chose
+	 * one which isn't renderable. And unlike more exotic formats, apps aren't
+	 * really prepared for the incomplete framebuffer this results in (they'd
+	 * have to retry with same internalFormat even, just different
+	 * srcFormat/srcType, which can't really be expected anyway).
+	 * Ideally, we'd defer format selection until later (if the texture is
+	 * used as a rt it's likely there's never data uploaded to it before it is attached
+	 * to a fbo), but this isn't really possible, so for now just always use
+	 * a renderable format.
+	 */
+	if (1 || fbo)
 		return _radeon_texformat_argb8888;
 
 	if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
@@ -267,8 +279,8 @@ mesa_format radeonChooseTextureFormat(struct gl_context * ctx,
 	radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 		"%s InternalFormat=%s(%d) type=%s format=%s\n",
 		__func__,
-		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
-		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+		_mesa_enum_to_string(internalFormat), internalFormat,
+		_mesa_enum_to_string(type), _mesa_enum_to_string(format));
 	radeon_print(RADEON_TEXTURE, RADEON_TRACE,
 			"%s do32bpt=%d force16bpt=%d\n",
 			__func__, do32bpt, force16bpt);
@@ -531,7 +543,7 @@ void radeon_image_target_texture_2d(struct gl_context *ctx, GLenum target,
 	__DRIscreen *screen;
 	__DRIimage *image;
 
-	screen = radeon->dri.screen;
+	screen = radeon->radeonScreen->driScreen;
 	image = screen->dri2.image->lookupEGLImage(screen, image_handle,
 						   screen->loaderPrivate);
 	if (image == NULL)
diff --git a/src/mesa/drivers/dri/swrast/Makefile.am b/src/mesa/drivers/dri/swrast/Makefile.am
index bfc3c10e334..9d21d9ea4dc 100644
--- a/src/mesa/drivers/dri/swrast/Makefile.am
+++ b/src/mesa/drivers/dri/swrast/Makefile.am
@@ -24,7 +24,6 @@
 include Makefile.sources
 
 AM_CFLAGS = \
-	-D__NOT_HAVE_DRM_H \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/ \
 	-I$(top_srcdir)/src/mapi \
@@ -33,6 +32,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
+	$(LIBDRM_CFLAGS) \
 	$(DEFINES) \
 	$(VISIBILITY_CFLAGS)
 
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index 022523eb00b..5c7dcac3841 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -1124,7 +1124,7 @@ static struct name_function functions[] = {
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
-   { "OSMesaPixelsStore", (OSMESAproc) OSMesaPixelStore },
+   { "OSMesaPixelStore", (OSMESAproc) OSMesaPixelStore },
    { "OSMesaGetIntegerv", (OSMESAproc) OSMesaGetIntegerv },
    { "OSMesaGetDepthBuffer", (OSMESAproc) OSMesaGetDepthBuffer },
    { "OSMesaGetColorBuffer", (OSMESAproc) OSMesaGetColorBuffer },
diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index 9c2e29e6472..53c8fb893b5 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -69,6 +69,25 @@ check_valid_to_render(struct gl_context *ctx, const char *function)
          return false;
       }
 
+      /* The spec argues that this is allowed because a tess ctrl shader
+       * without a tess eval shader can be used with transform feedback.
+       * However, glBeginTransformFeedback doesn't allow GL_PATCHES and
+       * therefore doesn't allow tessellation.
+       *
+       * Further investigation showed that this is indeed a spec bug and
+       * a tess ctrl shader without a tess eval shader shouldn't have been
+       * allowed, because there is no API in GL 4.0 that can make use of this
+       * to produce something useful.
+       *
+       * Also, all vendors except one don't support a tess ctrl shader without
+       * a tess eval shader anyway.
+       */
+      if (ctx->TessCtrlProgram._Current && !ctx->TessEvalProgram._Current) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(tess eval shader is missing)", function);
+         return false;
+      }
+
       /* Section 7.3 (Program Objects) of the OpenGL 4.5 Core Profile spec
        * says:
        *
@@ -127,6 +146,9 @@ _mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode)
    if (mode <= GL_TRIANGLE_STRIP_ADJACENCY)
       return _mesa_has_geometry_shaders(ctx);
 
+   if (mode == GL_PATCHES)
+      return _mesa_has_tessellation(ctx);
+
    return false;
 }
 
@@ -136,6 +158,7 @@ _mesa_is_valid_prim_mode(struct gl_context *ctx, GLenum mode)
  * etc?  Also, do additional checking related to transformation feedback.
  * Note: this function cannot be called during glNewList(GL_COMPILE) because
  * this code depends on current transform feedback state.
+ * Also, do additional checking related to tessellation shaders.
  */
 GLboolean
 _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
@@ -170,11 +193,29 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
     *   TRIANGLES_ADJACENCY_ARB and <mode> is not
     *   TRIANGLES_ADJACENCY_ARB or TRIANGLE_STRIP_ADJACENCY_ARB.
     *
+    * The GL spec doesn't mention any interaction with tessellation, which
+    * is clearly a spec bug. The same rule should apply, but instead of
+    * the draw primitive mode, the tessellation evaluation shader primitive
+    * mode should be used for the checking.
    */
    if (ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]) {
       const GLenum geom_mode =
          ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY]->Geom.InputType;
-      switch (mode) {
+      struct gl_shader_program *tes =
+         ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+      GLenum mode_before_gs = mode;
+
+      if (tes) {
+         if (tes->TessEval.PointMode)
+            mode_before_gs = GL_POINTS;
+         else if (tes->TessEval.PrimitiveMode == GL_ISOLINES)
+            mode_before_gs = GL_LINES;
+         else
+            /* the GL_QUADS mode generates triangles too */
+            mode_before_gs = GL_TRIANGLES;
+      }
+
+      switch (mode_before_gs) {
       case GL_POINTS:
          valid_enum = (geom_mode == GL_POINTS);
          break;
@@ -209,12 +250,42 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "%s(mode=%s vs geometry shader input %s)",
                      name,
-                     _mesa_lookup_prim_by_nr(mode),
+                     _mesa_lookup_prim_by_nr(mode_before_gs),
                      _mesa_lookup_prim_by_nr(geom_mode));
          return GL_FALSE;
       }
    }
 
+   /* From the OpenGL 4.0 (Core Profile) spec (section 2.12):
+    *
+    *     "Tessellation operates only on patch primitives. If tessellation is
+    *      active, any command that transfers vertices to the GL will
+    *      generate an INVALID_OPERATION error if the primitive mode is not
+    *      PATCHES.
+    *      Patch primitives are not supported by pipeline stages below the
+    *      tessellation evaluation shader. If there is no active program
+    *      object or the active program object does not contain a tessellation
+    *      evaluation shader, the error INVALID_OPERATION is generated by any
+    *      command that transfers vertices to the GL if the primitive mode is
+    *      PATCHES."
+    *
+    */
+   if (ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL] ||
+       ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]) {
+      if (mode != GL_PATCHES) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "only GL_PATCHES valid with tessellation");
+         return GL_FALSE;
+      }
+   }
+   else {
+      if (mode == GL_PATCHES) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "GL_PATCHES only valid with tessellation");
+         return GL_FALSE;
+      }
+   }
+
    /* From the GL_EXT_transform_feedback spec:
     *
     *     "The error INVALID_OPERATION is generated if Begin, or any command
@@ -247,6 +318,17 @@ _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name)
             pass = GL_FALSE;
          }
       }
+      else if (ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]) {
+         struct gl_shader_program *tes =
+            ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+
+         if (tes->TessEval.PointMode)
+            pass = ctx->TransformFeedback.Mode == GL_POINTS;
+         else if (tes->TessEval.PrimitiveMode == GL_ISOLINES)
+            pass = ctx->TransformFeedback.Mode == GL_LINES;
+         else
+            pass = ctx->TransformFeedback.Mode == GL_TRIANGLES;
+      }
       else {
          switch (mode) {
          case GL_POINTS:
@@ -291,7 +373,7 @@ valid_elements_type(struct gl_context *ctx, GLenum type, const char *name)
 
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(type = %s)", name,
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(type));
       return false;
    }
 }
diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c
index 9fc35520a38..935ba05b7cc 100644
--- a/src/mesa/main/atifragshader.c
+++ b/src/mesa/main/atifragshader.c
@@ -132,21 +132,21 @@ static void debug_op(GLint optype, GLuint arg_count, GLenum op, GLuint dst,
 
   op_name = atifs_ops[(arg_count-1)+(optype?3:0)];
   
-  fprintf(stderr, "%s(%s, %s", op_name, _mesa_lookup_enum_by_nr(op),
-	      _mesa_lookup_enum_by_nr(dst));
+  fprintf(stderr, "%s(%s, %s", op_name, _mesa_enum_to_string(op),
+	      _mesa_enum_to_string(dst));
   if (!optype)
     fprintf(stderr, ", %d", dstMask);
   
   fprintf(stderr, ", %s", create_dst_mod_str(dstMod));
   
-  fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg1),
-	      _mesa_lookup_enum_by_nr(arg1Rep), arg1Mod);
+  fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg1),
+	      _mesa_enum_to_string(arg1Rep), arg1Mod);
   if (arg_count>1)
-    fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg2),
-	      _mesa_lookup_enum_by_nr(arg2Rep), arg2Mod);
+    fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg2),
+	      _mesa_enum_to_string(arg2Rep), arg2Mod);
   if (arg_count>2)
-    fprintf(stderr, ", %s, %s, %d", _mesa_lookup_enum_by_nr(arg3),
-	      _mesa_lookup_enum_by_nr(arg3Rep), arg3Mod);
+    fprintf(stderr, ", %s, %s, %d", _mesa_enum_to_string(arg3),
+	      _mesa_enum_to_string(arg3Rep), arg3Mod);
 
   fprintf(stderr,")\n");
 
@@ -383,7 +383,7 @@ _mesa_EndFragmentShaderATI(void)
    for (j = 0; j < MAX_NUM_PASSES_ATI; j++) {
       for (i = 0; i < MAX_NUM_FRAGMENT_REGISTERS_ATI; i++) {
 	 GLuint op = curProg->SetupInst[j][i].Opcode;
-	 const char *op_enum = op > 5 ? _mesa_lookup_enum_by_nr(op) : "0";
+	 const char *op_enum = op > 5 ? _mesa_enum_to_string(op) : "0";
 	 GLuint src = curProg->SetupInst[j][i].src;
 	 GLuint swizzle = curProg->SetupInst[j][i].swizzle;
 	 fprintf(stderr, "%2d %04X %s %d %04X\n", i, op, op_enum, src,
@@ -392,8 +392,8 @@ _mesa_EndFragmentShaderATI(void)
       for (i = 0; i < curProg->numArithInstr[j]; i++) {
 	 GLuint op0 = curProg->Instructions[j][i].Opcode[0];
 	 GLuint op1 = curProg->Instructions[j][i].Opcode[1];
-	 const char *op0_enum = op0 > 5 ? _mesa_lookup_enum_by_nr(op0) : "0";
-	 const char *op1_enum = op1 > 5 ? _mesa_lookup_enum_by_nr(op1) : "0";
+	 const char *op0_enum = op0 > 5 ? _mesa_enum_to_string(op0) : "0";
+	 const char *op1_enum = op1 > 5 ? _mesa_enum_to_string(op1) : "0";
 	 GLuint count0 = curProg->Instructions[j][i].ArgCount[0];
 	 GLuint count1 = curProg->Instructions[j][i].ArgCount[1];
 	 fprintf(stderr, "%2d %04X %s %d %04X %s %d\n", i, op0, op0_enum, count0,
@@ -477,8 +477,8 @@ _mesa_PassTexCoordATI(GLuint dst, GLuint coord, GLenum swizzle)
 
 #if MESA_DEBUG_ATI_FS
    _mesa_debug(ctx, "%s(%s, %s, %s)\n", __func__,
-	       _mesa_lookup_enum_by_nr(dst), _mesa_lookup_enum_by_nr(coord),
-	       _mesa_lookup_enum_by_nr(swizzle));
+	       _mesa_enum_to_string(dst), _mesa_enum_to_string(coord),
+	       _mesa_enum_to_string(swizzle));
 #endif
 }
 
@@ -550,8 +550,8 @@ _mesa_SampleMapATI(GLuint dst, GLuint interp, GLenum swizzle)
 
 #if MESA_DEBUG_ATI_FS
    _mesa_debug(ctx, "%s(%s, %s, %s)\n", __func__,
-	       _mesa_lookup_enum_by_nr(dst), _mesa_lookup_enum_by_nr(interp),
-	       _mesa_lookup_enum_by_nr(swizzle));
+	       _mesa_enum_to_string(dst), _mesa_enum_to_string(interp),
+	       _mesa_enum_to_string(swizzle));
 #endif
 }
 
diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c
index 53626e38be9..08f13178f84 100644
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -937,7 +937,7 @@ _mesa_PopAttrib(void)
 
       if (MESA_VERBOSE & VERBOSE_API) {
          _mesa_debug(ctx, "glPopAttrib %s\n",
-                     _mesa_lookup_enum_by_nr(attr->kind));
+                     _mesa_enum_to_string(attr->kind));
       }
 
       switch (attr->kind) {
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index d869fa2aa09..4fc32962425 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -128,28 +128,28 @@ validate_blend_factors(struct gl_context *ctx, const char *func,
    if (!legal_src_factor(ctx, sfactorRGB)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(sfactorRGB = %s)", func,
-                  _mesa_lookup_enum_by_nr(sfactorRGB));
+                  _mesa_enum_to_string(sfactorRGB));
       return GL_FALSE;
    }
 
    if (!legal_dst_factor(ctx, dfactorRGB)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(dfactorRGB = %s)", func,
-                  _mesa_lookup_enum_by_nr(dfactorRGB));
+                  _mesa_enum_to_string(dfactorRGB));
       return GL_FALSE;
    }
 
    if (sfactorA != sfactorRGB && !legal_src_factor(ctx, sfactorA)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(sfactorA = %s)", func,
-                  _mesa_lookup_enum_by_nr(sfactorA));
+                  _mesa_enum_to_string(sfactorA));
       return GL_FALSE;
    }
 
    if (dfactorA != dfactorRGB && !legal_dst_factor(ctx, dfactorA)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(dfactorA = %s)", func,
-                  _mesa_lookup_enum_by_nr(dfactorA));
+                  _mesa_enum_to_string(dfactorA));
       return GL_FALSE;
    }
 
@@ -208,10 +208,10 @@ _mesa_BlendFuncSeparate( GLenum sfactorRGB, GLenum dfactorRGB,
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendFuncSeparate %s %s %s %s\n",
-                  _mesa_lookup_enum_by_nr(sfactorRGB),
-                  _mesa_lookup_enum_by_nr(dfactorRGB),
-                  _mesa_lookup_enum_by_nr(sfactorA),
-                  _mesa_lookup_enum_by_nr(dfactorA));
+                  _mesa_enum_to_string(sfactorRGB),
+                  _mesa_enum_to_string(dfactorRGB),
+                  _mesa_enum_to_string(sfactorA),
+                  _mesa_enum_to_string(dfactorA));
 
    if (!validate_blend_factors(ctx, "glBlendFuncSeparate",
                                sfactorRGB, dfactorRGB,
@@ -342,7 +342,7 @@ _mesa_BlendEquation( GLenum mode )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquation(%s)\n",
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(mode));
 
    if (!legal_blend_equation(ctx, mode)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glBlendEquation");
@@ -385,7 +385,7 @@ _mesa_BlendEquationiARB(GLuint buf, GLenum mode)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquationi(%u, %s)\n",
-                  buf, _mesa_lookup_enum_by_nr(mode));
+                  buf, _mesa_enum_to_string(mode));
 
    if (buf >= ctx->Const.MaxDrawBuffers) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBlendFuncSeparatei(buffer=%u)",
@@ -421,8 +421,8 @@ _mesa_BlendEquationSeparate( GLenum modeRGB, GLenum modeA )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquationSeparateEXT(%s %s)\n",
-                  _mesa_lookup_enum_by_nr(modeRGB),
-                  _mesa_lookup_enum_by_nr(modeA));
+                  _mesa_enum_to_string(modeRGB),
+                  _mesa_enum_to_string(modeA));
 
    if ( (modeRGB != modeA) && !ctx->Extensions.EXT_blend_equation_separate ) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -476,8 +476,8 @@ _mesa_BlendEquationSeparateiARB(GLuint buf, GLenum modeRGB, GLenum modeA)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBlendEquationSeparatei(%u, %s %s)\n", buf,
-                  _mesa_lookup_enum_by_nr(modeRGB),
-                  _mesa_lookup_enum_by_nr(modeA));
+                  _mesa_enum_to_string(modeRGB),
+                  _mesa_enum_to_string(modeA));
 
    if (buf >= ctx->Const.MaxDrawBuffers) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBlendEquationSeparatei(buffer=%u)",
@@ -567,7 +567,10 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glAlphaFunc(%s, %f)\n",
-                  _mesa_lookup_enum_by_nr(func), ref);
+                  _mesa_enum_to_string(func), ref);
+
+   if (ctx->Color.AlphaFunc == func && ctx->Color.AlphaRefUnclamped == ref)
+      return; /* no change */
 
    switch (func) {
    case GL_NEVER:
@@ -578,9 +581,6 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref )
    case GL_NOTEQUAL:
    case GL_GEQUAL:
    case GL_ALWAYS:
-      if (ctx->Color.AlphaFunc == func && ctx->Color.AlphaRefUnclamped == ref)
-         return; /* no change */
-
       FLUSH_VERTICES(ctx, _NEW_COLOR);
       ctx->Color.AlphaFunc = func;
       ctx->Color.AlphaRefUnclamped = ref;
@@ -613,7 +613,7 @@ _mesa_LogicOp( GLenum opcode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glLogicOp(%s)\n", _mesa_lookup_enum_by_nr(opcode));
+      _mesa_debug(ctx, "glLogicOp(%s)\n", _mesa_enum_to_string(opcode));
 
    switch (opcode) {
       case GL_CLEAR:
@@ -790,7 +790,7 @@ _mesa_ClampColor(GLenum target, GLenum clamp)
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "glClampColor(%s)",
-               _mesa_lookup_enum_by_nr(target));
+               _mesa_enum_to_string(target));
 }
 
 static GLboolean
@@ -930,12 +930,10 @@ void _mesa_init_color( struct gl_context * ctx )
    ctx->Color._ClampFragmentColor = GL_FALSE;
    ctx->Color.ClampReadColor = GL_FIXED_ONLY_ARB;
 
-   if (ctx->API == API_OPENGLES2) {
-      /* GLES 3 behaves as though GL_FRAMEBUFFER_SRGB is always enabled. */
-      ctx->Color.sRGBEnabled = GL_TRUE;
-   } else {
-      ctx->Color.sRGBEnabled = GL_FALSE;
-   }
+   /* GLES 1/2/3 behaves as though GL_FRAMEBUFFER_SRGB is always enabled
+    * if EGL_KHR_gl_colorspace has been used to request sRGB.
+    */
+   ctx->Color.sRGBEnabled = _mesa_is_gles(ctx);
 }
 
 /*@}*/
diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c
index db8fee5a414..a32f1a42aea 100644
--- a/src/mesa/main/blit.c
+++ b/src/mesa/main/blit.c
@@ -37,6 +37,7 @@
 #include "framebuffer.h"
 #include "glformats.h"
 #include "mtypes.h"
+#include "macros.h"
 #include "state.h"
 
 
@@ -58,6 +59,31 @@ find_attachment(const struct gl_framebuffer *fb,
 }
 
 
+/**
+ * Report whether the src and dst rectangles intersect on both axes; each
+ * rectangle's corners may come in either order, and touching edges count.
+ */
+bool
+_mesa_regions_overlap(int srcX0, int srcY0,
+                      int srcX1, int srcY1,
+                      int dstX0, int dstY0,
+                      int dstX1, int dstY1)
+{
+   /* Normalize each rectangle to [lo, hi] extents along each axis. */
+   const int src_x_lo = MIN2(srcX0, srcX1), src_x_hi = MAX2(srcX0, srcX1);
+   const int dst_x_lo = MIN2(dstX0, dstX1), dst_x_hi = MAX2(dstX0, dstX1);
+   const int src_y_lo = MIN2(srcY0, srcY1), src_y_hi = MAX2(srcY0, srcY1);
+   const int dst_y_lo = MIN2(dstY0, dstY1), dst_y_hi = MAX2(dstY0, dstY1);
+
+   /* The regions are disjoint when they are separated along either axis. */
+   const bool apart_in_x = src_x_hi < dst_x_lo || dst_x_hi < src_x_lo;
+   const bool apart_in_y = src_y_hi < dst_y_lo || dst_y_hi < src_y_lo;
+
+   /* Equal edge coordinates do not separate: touching counts as overlap. */
+   return !(apart_in_x || apart_in_y);
+}
+
+
 /**
  * Helper function for checking if the datatypes of color buffers are
  * compatible for glBlitFramebuffer.  From the 3.1 spec, page 198:
@@ -186,7 +212,7 @@ _mesa_blit_framebuffer(struct gl_context *ctx,
 
    if (!is_valid_blit_filter(ctx, filter)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid filter %s)", func,
-                  _mesa_lookup_enum_by_nr(filter));
+                  _mesa_enum_to_string(filter));
       return;
    }
 
@@ -194,7 +220,7 @@ _mesa_blit_framebuffer(struct gl_context *ctx,
         filter == GL_SCALED_RESOLVE_NICEST_EXT) &&
         (readFb->Visual.samples == 0 || drawFb->Visual.samples > 0)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s: invalid samples)", func,
-                  _mesa_lookup_enum_by_nr(filter));
+                  _mesa_enum_to_string(filter));
       return;
    }
 
@@ -522,7 +548,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                   " %d, %d, %d, %d, 0x%x, %s)\n",
                   srcX0, srcY0, srcX1, srcY1,
                   dstX0, dstY0, dstX1, dstY1,
-                  mask, _mesa_lookup_enum_by_nr(filter));
+                  mask, _mesa_enum_to_string(filter));
 
    _mesa_blit_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
                           srcX0, srcY0, srcX1, srcY1,
@@ -547,7 +573,7 @@ _mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
                   readFramebuffer, drawFramebuffer,
                   srcX0, srcY0, srcX1, srcY1,
                   dstX0, dstY0, dstX1, dstY1,
-                  mask, _mesa_lookup_enum_by_nr(filter));
+                  mask, _mesa_enum_to_string(filter));
 
    /*
     * According to PDF page 533 of the OpenGL 4.5 core spec (30.10.2014,
diff --git a/src/mesa/main/blit.h b/src/mesa/main/blit.h
index 54b946e3192..88dd4a9ec8d 100644
--- a/src/mesa/main/blit.h
+++ b/src/mesa/main/blit.h
@@ -28,6 +28,12 @@
 
 #include "glheader.h"
 
+extern bool
+_mesa_regions_overlap(int srcX0, int srcY0,
+                      int srcX1, int srcY1,
+                      int dstX0, int dstY0,
+                      int dstX1, int dstY1);
+
 extern void
 _mesa_blit_framebuffer(struct gl_context *ctx,
                        struct gl_framebuffer *readFb,
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 66dee680258..1cdea937f91 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -91,8 +91,9 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
    case GL_COPY_WRITE_BUFFER:
       return &ctx->CopyWriteBuffer;
    case GL_DRAW_INDIRECT_BUFFER:
-      if (ctx->API == API_OPENGL_CORE &&
-          ctx->Extensions.ARB_draw_indirect) {
+      if ((ctx->API == API_OPENGL_CORE &&
+           ctx->Extensions.ARB_draw_indirect) ||
+           _mesa_is_gles31(ctx)) {
          return &ctx->DrawIndirectBuffer;
       }
       break;
@@ -112,6 +113,11 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
          return &ctx->UniformBuffer;
       }
       break;
+   case GL_SHADER_STORAGE_BUFFER:
+      if (ctx->Extensions.ARB_shader_storage_buffer_object) {
+         return &ctx->ShaderStorageBuffer;
+      }
+      break;
    case GL_ATOMIC_COUNTER_BUFFER:
       if (ctx->Extensions.ARB_shader_atomic_counters) {
          return &ctx->AtomicBuffer;
@@ -831,6 +837,9 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
    _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer,
 				 ctx->Shared->NullBufferObj);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer,
+                                 ctx->Shared->NullBufferObj);
+
    _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer,
 				 ctx->Shared->NullBufferObj);
 
@@ -845,6 +854,14 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
       ctx->UniformBufferBindings[i].Size = -1;
    }
 
+   for (i = 0; i < MAX_COMBINED_SHADER_STORAGE_BUFFERS; i++) {
+      _mesa_reference_buffer_object(ctx,
+                                    &ctx->ShaderStorageBufferBindings[i].BufferObject,
+                                    ctx->Shared->NullBufferObj);
+      ctx->ShaderStorageBufferBindings[i].Offset = -1;
+      ctx->ShaderStorageBufferBindings[i].Size = -1;
+   }
+
    for (i = 0; i < MAX_COMBINED_ATOMIC_BUFFERS; i++) {
       _mesa_reference_buffer_object(ctx,
 				    &ctx->AtomicBufferBindings[i].BufferObject,
@@ -867,6 +884,8 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
 
    _mesa_reference_buffer_object(ctx, &ctx->UniformBuffer, NULL);
 
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, NULL);
+
    _mesa_reference_buffer_object(ctx, &ctx->AtomicBuffer, NULL);
 
    _mesa_reference_buffer_object(ctx, &ctx->DrawIndirectBuffer, NULL);
@@ -877,6 +896,12 @@ _mesa_free_buffer_objects( struct gl_context *ctx )
 				    NULL);
    }
 
+   for (i = 0; i < MAX_COMBINED_SHADER_STORAGE_BUFFERS; i++) {
+      _mesa_reference_buffer_object(ctx,
+                                    &ctx->ShaderStorageBufferBindings[i].BufferObject,
+                                    NULL);
+   }
+
    for (i = 0; i < MAX_COMBINED_ATOMIC_BUFFERS; i++) {
       _mesa_reference_buffer_object(ctx,
 				    &ctx->AtomicBufferBindings[i].BufferObject,
@@ -1158,7 +1183,7 @@ _mesa_BindBuffer(GLenum target, GLuint buffer)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBindBuffer(%s, %u)\n",
-                  _mesa_lookup_enum_by_nr(target), buffer);
+                  _mesa_enum_to_string(target), buffer);
 
    bind_buffer_object(ctx, target, buffer);
 }
@@ -1240,6 +1265,17 @@ _mesa_DeleteBuffers(GLsizei n, const GLuint *ids)
             _mesa_BindBuffer( GL_UNIFORM_BUFFER, 0 );
          }
 
+         /* unbind SSBO binding points */
+         for (j = 0; j < ctx->Const.MaxShaderStorageBufferBindings; j++) {
+            if (ctx->ShaderStorageBufferBindings[j].BufferObject == bufObj) {
+               _mesa_BindBufferBase(GL_SHADER_STORAGE_BUFFER, j, 0);
+            }
+         }
+
+         if (ctx->ShaderStorageBuffer == bufObj) {
+            _mesa_BindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+         }
+
          /* unbind Atomci Buffer binding points */
          for (j = 0; j < ctx->Const.MaxAtomicBufferBindings; j++) {
             if (ctx->AtomicBufferBindings[j].BufferObject == bufObj) {
@@ -1500,9 +1536,9 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "%s(%s, %ld, %p, %s)\n",
                   func,
-                  _mesa_lookup_enum_by_nr(target),
+                  _mesa_enum_to_string(target),
                   (long int) size, data,
-                  _mesa_lookup_enum_by_nr(usage));
+                  _mesa_enum_to_string(usage));
 
    if (size < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(size < 0)", func);
@@ -1535,7 +1571,7 @@ _mesa_buffer_data(struct gl_context *ctx, struct gl_buffer_object *bufObj,
 
    if (!valid_usage) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid usage: %s)", func,
-                  _mesa_lookup_enum_by_nr(usage));
+                  _mesa_enum_to_string(usage));
       return;
    }
 
@@ -1990,7 +2026,7 @@ get_buffer_parameter(struct gl_context *ctx,
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname: %s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return false;
 }
 
@@ -2337,7 +2373,7 @@ _mesa_map_buffer_range(struct gl_context *ctx,
 
    if (offset + length > bufObj->Size) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(offset %ld + length %ld > buffer_size %ld)", func,
+                  "%s(offset %td + length %td > buffer_size %td)", func,
                   offset, length, bufObj->Size);
       return NULL;
    }
@@ -2998,6 +3034,33 @@ set_ubo_binding(struct gl_context *ctx,
       bufObj->UsageHistory |= USAGE_UNIFORM_BUFFER;
 }
 
+/**
+ * Binds a buffer object to a shader storage buffer binding point and
+ * records the given offset, size and autoSize flag in that binding.
+ * The caller is responsible for flushing vertices and updating
+ * NewDriverState.
+ */
+static void
+set_ssbo_binding(struct gl_context *ctx,
+                 struct gl_shader_storage_buffer_binding *binding,
+                 struct gl_buffer_object *bufObj,
+                 GLintptr offset,
+                 GLsizeiptr size,
+                 GLboolean autoSize)
+{
+   _mesa_reference_buffer_object(ctx, &binding->BufferObject, bufObj);
+
+   binding->Offset = offset;
+   binding->Size = size;
+   binding->AutomaticSize = autoSize;
+
+   /* If this is a real buffer object, mark it as having been used
+    * at some point as a SSBO.  Unbinding callers pass size == -1.
+    */
+   if (size >= 0)
+      bufObj->UsageHistory |= USAGE_SHADER_STORAGE_BUFFER;
+}
+
 /**
  * Binds a buffer object to a uniform buffer binding point.
  *
@@ -3029,6 +3092,37 @@ bind_uniform_buffer(struct gl_context *ctx,
    set_ubo_binding(ctx, binding, bufObj, offset, size, autoSize);
 }
 
+/**
+ * Binds a buffer object to the shader storage buffer binding point
+ * 'index'.  The index is not range-checked here.
+ *
+ * Unlike set_ssbo_binding(), this function also flushes vertices and
+ * updates NewDriverState, but only if the binding actually changes.
+ */
+static void
+bind_shader_storage_buffer(struct gl_context *ctx,
+                           GLuint index,
+                           struct gl_buffer_object *bufObj,
+                           GLintptr offset,
+                           GLsizeiptr size,
+                           GLboolean autoSize)
+{
+   struct gl_shader_storage_buffer_binding *binding =
+      &ctx->ShaderStorageBufferBindings[index];
+
+   if (binding->BufferObject == bufObj &&
+       binding->Offset == offset &&
+       binding->Size == size &&
+       binding->AutomaticSize == autoSize) {
+      return;
+   }
+
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
+
+   set_ssbo_binding(ctx, binding, bufObj, offset, size, autoSize);
+}
+
 /**
  * Bind a region of a buffer object to a uniform block binding point.
  * \param index  the uniform buffer binding point index
@@ -3064,6 +3158,40 @@ bind_buffer_range_uniform_buffer(struct gl_context *ctx,
    bind_uniform_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
 }
 
+/**
+ * Bind a region of a buffer object to a shader storage block binding point.
+ * Reports GL_INVALID_VALUE and binds nothing if \p index is out of range or
+ * \p offset is misaligned.  NullBufferObj is bound with offset = size = -1.
+ * \param index  the shader storage buffer binding point index
+ * \param offset,size  start and size of the region inside \p bufObj
+ */
+static void
+bind_buffer_range_shader_storage_buffer(struct gl_context *ctx,
+                                        GLuint index,
+                                        struct gl_buffer_object *bufObj,
+                                        GLintptr offset,
+                                        GLsizeiptr size)
+{
+   if (index >= ctx->Const.MaxShaderStorageBufferBindings) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferRange(index=%u)", index);
+      return;
+   }
+
+   if (offset & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glBindBufferRange(offset misaligned %d/%d)", (int) offset,
+                  ctx->Const.ShaderStorageBufferOffsetAlignment);
+      return;
+   }
+
+   if (bufObj == ctx->Shared->NullBufferObj) {
+      offset = -1;
+      size = -1;
+   }
+
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
+   bind_shader_storage_buffer(ctx, index, bufObj, offset, size, GL_FALSE);
+}
 
 /**
  * Bind a buffer object to a uniform block binding point.
@@ -3087,6 +3215,28 @@ bind_buffer_base_uniform_buffer(struct gl_context *ctx,
       bind_uniform_buffer(ctx, index, bufObj, 0, 0, GL_TRUE);
 }
 
+/**
+ * Bind a whole buffer object to a shader storage block binding point
+ * (offset 0, autoSize set); reports GL_INVALID_VALUE for a bad index.
+ */
+static void
+bind_buffer_base_shader_storage_buffer(struct gl_context *ctx,
+                                       GLuint index,
+                                       struct gl_buffer_object *bufObj)
+{
+   if (index >= ctx->Const.MaxShaderStorageBufferBindings) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glBindBufferBase(index=%u)", index);
+      return;
+   }
+
+   _mesa_reference_buffer_object(ctx, &ctx->ShaderStorageBuffer, bufObj);
+
+   if (bufObj == ctx->Shared->NullBufferObj)
+      bind_shader_storage_buffer(ctx, index, bufObj, -1, -1, GL_TRUE);
+   else
+      bind_shader_storage_buffer(ctx, index, bufObj, 0, 0, GL_TRUE);
+}
+
 /**
  * Binds a buffer object to an atomic buffer binding point.
  *
@@ -3219,6 +3369,35 @@ error_check_bind_uniform_buffers(struct gl_context *ctx,
    return true;
 }
 
+/** Validate a multi-bind request for SSBO binding points; false on error. */
+static bool
+error_check_bind_shader_storage_buffers(struct gl_context *ctx,
+                                        GLuint first, GLsizei count,
+                                        const char *caller)
+{
+   if (!ctx->Extensions.ARB_shader_storage_buffer_object) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "%s(target=GL_SHADER_STORAGE_BUFFER)", caller);
+      return false;
+   }
+
+   /* The ARB_multi_bind spec says: "An INVALID_OPERATION error is generated
+    * if <first> + <count> is greater than the number of target-specific
+    * indexed binding points, as described in section 6.7.1."
+    * The unsigned sum wraps for a huge <first>; reject that case as well.
+    */
+   if (first + count > ctx->Const.MaxShaderStorageBufferBindings ||
+       (count > 0 && first + count < first)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(first=%u + count=%d > the value of "
+                  "GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS=%u)",
+                  caller, first, count,
+                  ctx->Const.MaxShaderStorageBufferBindings);
+      return false;
+   }
+   return true;
+}
+
 /**
  * Unbind all uniform buffers in the range
  * <first> through <first>+<count>-1
@@ -3234,6 +3413,22 @@ unbind_uniform_buffers(struct gl_context *ctx, GLuint first, GLsizei count)
                       bufObj, -1, -1, GL_TRUE);
 }
 
+/**
+ * Reset shader storage buffer bindings <first> .. <first>+<count>-1 to the
+ * null buffer object with offset/size -1; does not flush vertices itself.
+ */
+static void
+unbind_shader_storage_buffers(struct gl_context *ctx, GLuint first,
+                              GLsizei count)
+{
+   struct gl_buffer_object *bufObj = ctx->Shared->NullBufferObj;
+   GLint i;
+
+   for (i = 0; i < count; i++)
+      set_ssbo_binding(ctx, &ctx->ShaderStorageBufferBindings[first + i],
+                       bufObj, -1, -1, GL_TRUE);
+}
+
 static void
 bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count,
                           const GLuint *buffers)
@@ -3300,6 +3495,73 @@ bind_uniform_buffers_base(struct gl_context *ctx, GLuint first, GLsizei count,
    _mesa_end_bufferobj_lookups(ctx);
 }
 
+/** Bind whole buffers to SSBO binding points <first>..<first>+<count>-1. */
+static void
+bind_shader_storage_buffers_base(struct gl_context *ctx, GLuint first,
+                                 GLsizei count, const GLuint *buffers)
+{
+   GLint i;
+
+   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
+                                                "glBindBuffersBase"))
+      return;
+
+   /* Assume that at least one binding will be changed; flush up front */
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
+
+   if (!buffers) {
+      /* The ARB_multi_bind spec says:
+       *
+       *   "If <buffers> is NULL, all bindings from <first> through
+       *    <first>+<count>-1 are reset to their unbound (zero) state."
+       */
+      unbind_shader_storage_buffers(ctx, first, count);
+      return;
+   }
+
+   /* Note that the error semantics for multi-bind commands differ from
+    * those of other GL commands.
+    *
+    * The Issues section in the ARB_multi_bind spec says:
+    *
+    *    "(11) Typically, OpenGL specifies that if an error is generated by a
+    *          command, that command has no effect.  This is somewhat
+    *          unfortunate for multi-bind commands, because it would require a
+    *          first pass to scan the entire list of bound objects for errors
+    *          and then a second pass to actually perform the bindings.
+    *          Should we have different error semantics?
+    *
+    *       RESOLVED:  Yes.  In this specification, when the parameters for
+    *       one of the <count> binding points are invalid, that binding point
+    *       is not updated and an error will be generated.  However, other
+    *       binding points in the same command will be updated if their
+    *       parameters are valid and no other error occurs."
+    */
+
+   _mesa_begin_bufferobj_lookups(ctx);
+   for (i = 0; i < count; i++) {
+      struct gl_shader_storage_buffer_binding *binding =
+          &ctx->ShaderStorageBufferBindings[first + i];
+      struct gl_buffer_object *bufObj;
+
+      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
+         bufObj = binding->BufferObject;
+      else
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
+                                                    "glBindBuffersBase");
+
+      if (bufObj) {
+         if (bufObj == ctx->Shared->NullBufferObj)
+            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_TRUE);
+         else
+            set_ssbo_binding(ctx, binding, bufObj, 0, 0, GL_TRUE);
+      }
+   }
+
+   _mesa_end_bufferobj_lookups(ctx);
+}
+
 static void
 bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
                            const GLuint *buffers,
@@ -3405,6 +3667,112 @@ bind_uniform_buffers_range(struct gl_context *ctx, GLuint first, GLsizei count,
    _mesa_end_bufferobj_lookups(ctx);
 }
 
+static void
+bind_shader_storage_buffers_range(struct gl_context *ctx, GLuint first,
+                                  GLsizei count, const GLuint *buffers,
+                                  const GLintptr *offsets,
+                                  const GLsizeiptr *sizes)
+{
+   GLint i;
+
+   if (!error_check_bind_shader_storage_buffers(ctx, first, count,
+                                                "glBindBuffersRange"))
+      return;
+
+   /* Flush and dirty SSBO state up front; assume >= 1 binding will change */
+   FLUSH_VERTICES(ctx, 0);
+   ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
+
+   if (!buffers) {
+      /* The ARB_multi_bind spec says:
+       *
+       *    "If <buffers> is NULL, all bindings from <first> through
+       *     <first>+<count>-1 are reset to their unbound (zero) state.
+       *     In this case, the offsets and sizes associated with the
+       *     binding points are set to default values, ignoring
+       *     <offsets> and <sizes>."
+       */
+      unbind_shader_storage_buffers(ctx, first, count);
+      return;
+   }
+
+   /* Note that the error semantics for multi-bind commands differ from
+    * those of other GL commands.
+    *
+    * The Issues section in the ARB_multi_bind spec says:
+    *
+    *    "(11) Typically, OpenGL specifies that if an error is generated by a
+    *          command, that command has no effect.  This is somewhat
+    *          unfortunate for multi-bind commands, because it would require a
+    *          first pass to scan the entire list of bound objects for errors
+    *          and then a second pass to actually perform the bindings.
+    *          Should we have different error semantics?
+    *
+    *       RESOLVED:  Yes.  In this specification, when the parameters for
+    *       one of the <count> binding points are invalid, that binding point
+    *       is not updated and an error will be generated.  However, other
+    *       binding points in the same command will be updated if their
+    *       parameters are valid and no other error occurs."
+    */
+
+   _mesa_begin_bufferobj_lookups(ctx);
+
+   for (i = 0; i < count; i++) {
+      struct gl_shader_storage_buffer_binding *binding =
+         &ctx->ShaderStorageBufferBindings[first + i];
+      struct gl_buffer_object *bufObj;
+
+      if (!bind_buffers_check_offset_and_size(ctx, i, offsets, sizes))
+         continue;
+
+      /* The ARB_multi_bind spec says:
+       *
+       *     "An INVALID_VALUE error is generated by BindBuffersRange if any
+       *      pair of values in <offsets> and <sizes> does not respectively
+       *      satisfy the constraints described for those parameters for the
+       *      specified target, as described in section 6.7.1 (per binding)."
+       *
+       * Section 6.7.1 refers to table 6.5, which says:
+       *
+       *     "┌───────────────────────────────────────────────────────────────┐
+       *      │ Shader storage buffer array bindings (see sec. 7.8)           │
+       *      ├─────────────────────┬─────────────────────────────────────────┤
+       *      │  ...                │  ...                                    │
+       *      │  offset restriction │  multiple of value of SHADER_STORAGE_-  │
+       *      │                     │  BUFFER_OFFSET_ALIGNMENT                │
+       *      │  ...                │  ...                                    │
+       *      │  size restriction   │  none                                   │
+       *      └─────────────────────┴─────────────────────────────────────────┘"
+       */
+      if (offsets[i] & (ctx->Const.ShaderStorageBufferOffsetAlignment - 1)) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glBindBuffersRange(offsets[%u]=%" PRId64
+                     " is misaligned; it must be a multiple of the value of "
+                     "GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT=%u when "
+                     "target=GL_SHADER_STORAGE_BUFFER)",
+                     i, (int64_t) offsets[i],
+                     ctx->Const.ShaderStorageBufferOffsetAlignment);
+         continue;
+      }
+
+      if (binding->BufferObject && binding->BufferObject->Name == buffers[i])
+         bufObj = binding->BufferObject;
+      else
+         bufObj = _mesa_multi_bind_lookup_bufferobj(ctx, buffers, i,
+                                                    "glBindBuffersRange");
+
+      if (bufObj) {
+         if (bufObj == ctx->Shared->NullBufferObj)
+            set_ssbo_binding(ctx, binding, bufObj, -1, -1, GL_FALSE);
+         else
+            set_ssbo_binding(ctx, binding, bufObj,
+                             offsets[i], sizes[i], GL_FALSE);
+      }
+   }
+
+   _mesa_end_bufferobj_lookups(ctx);
+}
+
 static bool
 error_check_bind_xfb_buffers(struct gl_context *ctx,
                              struct gl_transform_feedback_object *tfObj,
@@ -3894,6 +4262,9 @@ _mesa_BindBufferRange(GLenum target, GLuint index,
    case GL_UNIFORM_BUFFER:
       bind_buffer_range_uniform_buffer(ctx, index, bufObj, offset, size);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_buffer_range_shader_storage_buffer(ctx, index, bufObj, offset, size);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffer(ctx, index, bufObj, offset, size,
                          "glBindBufferRange");
@@ -3960,6 +4331,9 @@ _mesa_BindBufferBase(GLenum target, GLuint index, GLuint buffer)
    case GL_UNIFORM_BUFFER:
       bind_buffer_base_uniform_buffer(ctx, index, bufObj);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_buffer_base_shader_storage_buffer(ctx, index, bufObj);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffer(ctx, index, bufObj, 0, 0,
                          "glBindBufferBase");
@@ -3984,13 +4358,17 @@ _mesa_BindBuffersRange(GLenum target, GLuint first, GLsizei count,
    case GL_UNIFORM_BUFFER:
       bind_uniform_buffers_range(ctx, first, count, buffers, offsets, sizes);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_shader_storage_buffers_range(ctx, first, count, buffers, offsets,
+                                        sizes);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffers_range(ctx, first, count, buffers,
                                 offsets, sizes);
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersRange(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       break;
    }
 }
@@ -4008,12 +4386,15 @@ _mesa_BindBuffersBase(GLenum target, GLuint first, GLsizei count,
    case GL_UNIFORM_BUFFER:
       bind_uniform_buffers_base(ctx, first, count, buffers);
       return;
+   case GL_SHADER_STORAGE_BUFFER:
+      bind_shader_storage_buffers_base(ctx, first, count, buffers);
+      return;
    case GL_ATOMIC_COUNTER_BUFFER:
       bind_atomic_buffers_base(ctx, first, count, buffers);
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBindBuffersBase(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       break;
    }
 }
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 0536266d756..93588a2ee18 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -251,7 +251,7 @@ _mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API) {
-      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_enum_to_string(buffer));
    }
 
    if (buffer == GL_NONE) {
@@ -264,14 +264,14 @@ _mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
       if (destMask == BAD_MASK) {
          /* totally bogus buffer */
          _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", caller,
-                     _mesa_lookup_enum_by_nr(buffer));
+                     _mesa_enum_to_string(buffer));
          return;
       }
       destMask &= supportedMask;
       if (destMask == 0x0) {
          /* none of the named color buffers exist! */
          _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffer %s)",
-                     caller, _mesa_lookup_enum_by_nr(buffer));
+                     caller, _mesa_enum_to_string(buffer));
          return;
       }
    }
@@ -411,7 +411,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
           */
          if (destMask[output] == BAD_MASK) {
             _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -427,7 +427,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
           */
          if (_mesa_bitcount(destMask[output]) > 1) {
             _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -445,7 +445,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
          if (destMask[output] == 0) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(unsupported buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -459,7 +459,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
              buffers[output] != GL_COLOR_ATTACHMENT0 + output) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(unsupported buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -471,7 +471,7 @@ _mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
          if (destMask[output] & usedBufferMask) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(duplicated buffer %s)",
-                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
+                        caller, _mesa_enum_to_string(buffers[output]));
             return;
          }
 
@@ -700,7 +700,7 @@ _mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_enum_to_string(buffer));
 
    if (buffer == GL_NONE) {
       /* This is legal--it means that no buffer should be bound for reading. */
@@ -712,14 +712,14 @@ _mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
       if (srcBuffer == -1) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "%s(invalid buffer %s)", caller,
-                     _mesa_lookup_enum_by_nr(buffer));
+                     _mesa_enum_to_string(buffer));
          return;
       }
       supportedMask = supported_buffer_bitmask(ctx, fb);
       if (((1 << srcBuffer) & supportedMask) == 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "%s(invalid buffer %s)", caller,
-                     _mesa_lookup_enum_by_nr(buffer));
+                     _mesa_enum_to_string(buffer));
          return;
       }
    }
diff --git a/src/mesa/main/clear.c b/src/mesa/main/clear.c
index 426caea4709..3bfcc5c0e39 100644
--- a/src/mesa/main/clear.c
+++ b/src/mesa/main/clear.c
@@ -325,6 +325,18 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value)
       _mesa_update_state( ctx );
    }
 
+   /* Page 498 of the PDF, section '17.4.3.1 Clearing Individual Buffers'
+    * of the OpenGL 4.5 spec states:
+    *
+    *    "An INVALID_ENUM error is generated by ClearBufferiv and
+    *     ClearNamedFramebufferiv if buffer is not COLOR or STENCIL."
+    */
+   if (buffer == GL_DEPTH || buffer == GL_DEPTH_STENCIL) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glClearBufferiv(buffer=GL_DEPTH || GL_DEPTH_STENCIL)");
+      return;
+   }
+
    switch (buffer) {
    case GL_STENCIL:
       /* Page 264 (page 280 of the PDF) of the OpenGL 3.0 spec says:
@@ -395,7 +407,7 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value)
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferiv(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 }
@@ -485,7 +497,7 @@ _mesa_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value)
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferuiv(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 }
@@ -596,7 +608,7 @@ _mesa_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value)
       return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfv(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 }
@@ -636,7 +648,7 @@ _mesa_ClearBufferfi(GLenum buffer, GLint drawbuffer,
 
    if (buffer != GL_DEPTH_STENCIL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glClearBufferfi(buffer=%s)",
-                  _mesa_lookup_enum_by_nr(buffer));
+                  _mesa_enum_to_string(buffer));
       return;
    }
 
diff --git a/src/mesa/main/condrender.c b/src/mesa/main/condrender.c
index 77e4b95ee8f..46c6036d2a5 100644
--- a/src/mesa/main/condrender.c
+++ b/src/mesa/main/condrender.c
@@ -87,7 +87,7 @@ _mesa_BeginConditionalRender(GLuint queryId, GLenum mode)
       /* fallthrough - invalid */
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glBeginConditionalRender(mode=%s)",
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(mode));
       return;
    }
 
@@ -184,7 +184,7 @@ _mesa_check_conditional_render(struct gl_context *ctx)
    default:
       _mesa_problem(ctx, "Bad cond render mode %s in "
                     " _mesa_check_conditional_render()",
-                    _mesa_lookup_enum_by_nr(ctx->Query.CondRenderMode));
+                    _mesa_enum_to_string(ctx->Query.CondRenderMode));
       return GL_TRUE;
    }
 }
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 9c3baf4c6aa..b35031db3c9 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -171,8 +171,10 @@
 #define MAX_PROGRAM_LOCAL_PARAMS       4096
 #define MAX_UNIFORMS                   4096
 #define MAX_UNIFORM_BUFFERS            15 /* + 1 default uniform buffer */
+#define MAX_SHADER_STORAGE_BUFFERS     7  /* + 1 default shader storage buffer */
 /* 6 is for vertex, hull, domain, geometry, fragment, and compute shader. */
 #define MAX_COMBINED_UNIFORM_BUFFERS   (MAX_UNIFORM_BUFFERS * 6)
+#define MAX_COMBINED_SHADER_STORAGE_BUFFERS   (MAX_SHADER_STORAGE_BUFFERS * 6)
 #define MAX_ATOMIC_COUNTERS            4096
 /* 6 is for vertex, hull, domain, geometry, fragment, and compute shader. */
 #define MAX_COMBINED_ATOMIC_BUFFERS    (MAX_UNIFORM_BUFFERS * 6)
@@ -272,6 +274,12 @@
 #define MAX_VERTEX_STREAMS                  4
 /*@}*/
 
+/** For GL_ARB_shader_subroutine */
+/*@{*/
+#define MAX_SUBROUTINES                   256
+#define MAX_SUBROUTINE_UNIFORM_LOCATIONS  1024
+/*@}*/
+
 /** For GL_INTEL_performance_query */
 /*@{*/
 #define MAX_PERFQUERY_QUERY_NAME_LENGTH     256
@@ -294,6 +302,14 @@
 /** For GL_ARB_pipeline_statistics_query */
 #define MAX_PIPELINE_STATISTICS             11
 
+/** For GL_ARB_tessellation_shader */
+/*@{*/
+#define MAX_TESS_GEN_LEVEL 64
+#define MAX_PATCH_VERTICES 32
+#define MAX_TESS_PATCH_COMPONENTS 120
+#define MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS 4096
+/*@}*/
+
 /*
  * Color channel component order
  * 
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 79fa01849e0..888c461d1c2 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -120,6 +120,7 @@
 #include "shaderobj.h"
 #include "shaderimage.h"
 #include "util/simple_list.h"
+#include "util/strtod.h"
 #include "state.h"
 #include "stencil.h"
 #include "texcompress_s3tc.h"
@@ -337,31 +338,6 @@ _mesa_destroy_visual( struct gl_config *vis )
 /*@{*/
 
 
-/**
- * This is lame.  gdb only seems to recognize enum types that are
- * actually used somewhere.  We want to be able to print/use enum
- * values such as TEXTURE_2D_INDEX in gdb.  But we don't actually use
- * the gl_texture_index type anywhere.  Thus, this lame function.
- */
-static void
-dummy_enum_func(void)
-{
-   gl_buffer_index bi = BUFFER_FRONT_LEFT;
-   gl_face_index fi = FACE_POS_X;
-   gl_frag_result fr = FRAG_RESULT_DEPTH;
-   gl_texture_index ti = TEXTURE_2D_ARRAY_INDEX;
-   gl_vert_attrib va = VERT_ATTRIB_POS;
-   gl_varying_slot vs = VARYING_SLOT_POS;
-
-   (void) bi;
-   (void) fi;
-   (void) fr;
-   (void) ti;
-   (void) va;
-   (void) vs;
-}
-
-
 /**
  * One-time initialization mutex lock.
  *
@@ -370,6 +346,16 @@ dummy_enum_func(void)
 mtx_t OneTimeLock = _MTX_INITIALIZER_NP;
 
 
+/**
+ * atexit() hook: runs Mesa's one-time-fini functions (compiler, locale).
+ */
+
+static void
+one_time_fini(void)
+{
+   _mesa_destroy_shader_compiler();
+   _mesa_locale_fini();
+}
 
 /**
  * Calls all the various one-time-init functions in Mesa.
@@ -391,13 +377,14 @@ one_time_init( struct gl_context *ctx )
    if (!api_init_mask) {
       GLuint i;
 
-      /* do some implementation tests */
-      assert( sizeof(GLbyte) == 1 );
-      assert( sizeof(GLubyte) == 1 );
-      assert( sizeof(GLshort) == 2 );
-      assert( sizeof(GLushort) == 2 );
-      assert( sizeof(GLint) == 4 );
-      assert( sizeof(GLuint) == 4 );
+      STATIC_ASSERT(sizeof(GLbyte) == 1);
+      STATIC_ASSERT(sizeof(GLubyte) == 1);
+      STATIC_ASSERT(sizeof(GLshort) == 2);
+      STATIC_ASSERT(sizeof(GLushort) == 2);
+      STATIC_ASSERT(sizeof(GLint) == 4);
+      STATIC_ASSERT(sizeof(GLuint) == 4);
+
+      _mesa_locale_init();
 
       _mesa_one_time_init_extension_overrides();
 
@@ -407,6 +394,8 @@ one_time_init( struct gl_context *ctx )
          _mesa_ubyte_to_float_color_tab[i] = (float) i / 255.0F;
       }
 
+      atexit(one_time_fini);
+
 #if defined(DEBUG) && defined(__DATE__) && defined(__TIME__)
       if (MESA_VERBOSE != 0) {
 	 _mesa_debug(ctx, "Mesa %s DEBUG build %s %s\n",
@@ -429,13 +418,6 @@ one_time_init( struct gl_context *ctx )
    api_init_mask |= 1 << ctx->API;
 
    mtx_unlock(&OneTimeLock);
-
-   /* Hopefully atexit() is widely available.  If not, we may need some
-    * #ifdef tests here.
-    */
-   atexit(_mesa_destroy_shader_compiler);
-
-   dummy_enum_func();
 }
 
 
@@ -496,6 +478,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage,
       prog->MaxInputComponents = 16 * 4; /* old limit not to break tnl and swrast */
       prog->MaxOutputComponents = 0; /* value not used */
       break;
+   case MESA_SHADER_TESS_CTRL:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       prog->MaxParameters = MAX_VERTEX_PROGRAM_PARAMS;
       prog->MaxAttribs = MAX_VERTEX_GENERIC_ATTRIBS;
@@ -554,6 +538,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage,
 
    prog->MaxAtomicBuffers = 0;
    prog->MaxAtomicCounters = 0;
+
+   prog->MaxShaderStorageBlocks = 8;
 }
 
 
@@ -615,6 +601,12 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api)
    consts->MaxUniformBlockSize = 16384;
    consts->UniformBufferOffsetAlignment = 1;
 
+   /** GL_ARB_shader_storage_buffer_object */
+   consts->MaxCombinedShaderStorageBlocks = 8;
+   consts->MaxShaderStorageBufferBindings = 8;
+   consts->MaxShaderStorageBlockSize = 128 * 1024 * 1024; /* 2^27 */
+   consts->ShaderStorageBufferOffsetAlignment = 256;
+
    /* GL_ARB_explicit_uniform_location, GL_MAX_UNIFORM_LOCATIONS */
    consts->MaxUserAssignableUniformLocations =
       4 * MESA_SHADER_STAGES * MAX_UNIFORMS;
@@ -724,6 +716,14 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api)
 
    /** GL_KHR_context_flush_control */
    consts->ContextReleaseBehavior = GL_CONTEXT_RELEASE_BEHAVIOR_FLUSH;
+
+   /** GL_ARB_tessellation_shader */
+   consts->MaxTessGenLevel = MAX_TESS_GEN_LEVEL;
+   consts->MaxPatchVertices = MAX_PATCH_VERTICES;
+   consts->Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS;
+   consts->Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS;
+   consts->MaxTessPatchComponents = MAX_TESS_PATCH_COMPONENTS;
+   consts->MaxTessControlTotalOutputComponents = MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS;
 }
 
 
@@ -1331,6 +1331,8 @@ _mesa_free_context_data( struct gl_context *ctx )
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, NULL);
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._TnlProgram, NULL);
 
+   _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current, NULL);
+   _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current, NULL);
    _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL);
 
    _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, NULL);
diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
index 6f3c941016f..0f7529ad975 100644
--- a/src/mesa/main/context.h
+++ b/src/mesa/main/context.h
@@ -343,6 +343,26 @@ _mesa_has_compute_shaders(const struct gl_context *ctx)
       (ctx->API == API_OPENGLES2 && ctx->Version >= 31);
 }
 
+/**
+ * Shader subroutines: core profile with Version >= 40 or the ARB extension.
+ */
+static inline bool
+_mesa_has_shader_subroutine(const struct gl_context *ctx)
+{
+   return ctx->API == API_OPENGL_CORE &&
+      (ctx->Version >= 40 || ctx->Extensions.ARB_shader_subroutine);
+}
+
+/**
+ * Tessellation: core profile with ARB_tessellation_shader enabled.
+ */
+static inline GLboolean
+_mesa_has_tessellation(const struct gl_context *ctx)
+{
+   return ctx->API == API_OPENGL_CORE &&
+          ctx->Extensions.ARB_tessellation_shader;
+}
+
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index e8732c6175b..05bc50dd2c6 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -93,7 +93,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCopyImageSubData(%sTarget = %s)", dbg_prefix,
-                  _mesa_lookup_enum_by_nr(*target));
+                  _mesa_enum_to_string(*target));
       return false;
    }
 
@@ -159,7 +159,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
       if ((*tex_obj)->Target != *target) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCopyImageSubData(%sTarget = %s)", dbg_prefix,
-                     _mesa_lookup_enum_by_nr(*target));
+                     _mesa_enum_to_string(*target));
          return false;
       }
 
@@ -416,9 +416,9 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
       _mesa_debug(ctx, "glCopyImageSubData(%u, %s, %d, %d, %d, %d, "
                                           "%u, %s, %d, %d, %d, %d, "
                                           "%d, %d, %d)\n",
-                  srcName, _mesa_lookup_enum_by_nr(srcTarget), srcLevel,
+                  srcName, _mesa_enum_to_string(srcTarget), srcLevel,
                   srcX, srcY, srcZ,
-                  dstName, _mesa_lookup_enum_by_nr(dstTarget), dstLevel,
+                  dstName, _mesa_enum_to_string(dstTarget), dstLevel,
                   dstX, dstY, dstZ,
-                  srcWidth, srcHeight, srcWidth);
+                  srcWidth, srcHeight, srcDepth);
 
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index d783e34222f..87eb63ea374 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -232,11 +232,13 @@ struct dd_function_table {
 
 
    /**
-    * Called by glGetTexImage().
+    * Called by glGetTexImage(), glGetTextureSubImage().
     */
-   void (*GetTexImage)( struct gl_context *ctx,
-                        GLenum format, GLenum type, GLvoid *pixels,
-                        struct gl_texture_image *texImage );
+   void (*GetTexSubImage)(struct gl_context *ctx,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLsizei depth,
+                          GLenum format, GLenum type, GLvoid *pixels,
+                          struct gl_texture_image *texImage);
 
    /**
     * Called by glClearTex[Sub]Image
@@ -326,16 +328,19 @@ struct dd_function_table {
    void (*CompressedTexSubImage)(struct gl_context *ctx, GLuint dims,
                                  struct gl_texture_image *texImage,
                                  GLint xoffset, GLint yoffset, GLint zoffset,
-                                 GLsizei width, GLint height, GLint depth,
+                                 GLsizei width, GLsizei height, GLsizei depth,
                                  GLenum format,
                                  GLsizei imageSize, const GLvoid *data);
 
    /**
     * Called by glGetCompressedTexImage.
     */
-   void (*GetCompressedTexImage)(struct gl_context *ctx,
-                                 struct gl_texture_image *texImage,
-                                 GLvoid *data);
+   void (*GetCompressedTexSubImage)(struct gl_context *ctx,
+                                    struct gl_texture_image *texImage,
+                                    GLint xoffset, GLint yoffset,
+                                    GLint zoffset, GLsizei width,
+                                    GLsizei height, GLsizei depth,
+                                    GLvoid *data);
    /*@}*/
 
    /**
diff --git a/src/mesa/main/debug.c b/src/mesa/main/debug.c
index c93e84a04d0..5ca7d5ce500 100644
--- a/src/mesa/main/debug.c
+++ b/src/mesa/main/debug.c
@@ -272,7 +272,9 @@ write_texture_image(struct gl_texture_object *texObj,
       store = ctx->Pack; /* save */
       ctx->Pack = ctx->DefaultPacking;
 
-      ctx->Driver.GetTexImage(ctx, GL_RGBA, GL_UNSIGNED_BYTE, buffer, img);
+      ctx->Driver.GetTexSubImage(ctx,
+                                 0, 0, 0, img->Width, img->Height, img->Depth,
+                                 GL_RGBA, GL_UNSIGNED_BYTE, buffer, img);
 
       /* make filename */
       _mesa_snprintf(s, sizeof(s), "/tmp/tex%u.l%u.f%u.ppm", texObj->Name, level, face);
@@ -411,7 +413,7 @@ dump_renderbuffer(const struct gl_renderbuffer *rb, GLboolean writeImage)
 {
    printf("Renderbuffer %u: %u x %u  IntFormat = %s\n",
 	  rb->Name, rb->Width, rb->Height,
-	  _mesa_lookup_enum_by_nr(rb->InternalFormat));
+	  _mesa_enum_to_string(rb->InternalFormat));
    if (writeImage) {
       _mesa_write_renderbuffer_image(rb);
    }
diff --git a/src/mesa/main/depth.c b/src/mesa/main/depth.c
index bb4591cf152..c3534407599 100644
--- a/src/mesa/main/depth.c
+++ b/src/mesa/main/depth.c
@@ -63,7 +63,7 @@ _mesa_DepthFunc( GLenum func )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_lookup_enum_by_nr(func));
+      _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_enum_to_string(func));
 
    if (ctx->Depth.Func == func)
       return;
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index aafe486fb60..5554738d1a3 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -9000,7 +9000,7 @@ _mesa_NewList(GLuint name, GLenum mode)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glNewList %u %s\n", name,
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(mode));
 
    if (name == 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glNewList");
@@ -9688,7 +9688,7 @@ _mesa_initialize_save_table(const struct gl_context *ctx)
 static const char *
 enum_string(GLenum k)
 {
-   return _mesa_lookup_enum_by_nr(k);
+   return _mesa_enum_to_string(k);
 }
 
 
@@ -9827,19 +9827,19 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
             break;
          case OPCODE_BIND_TEXTURE:
             fprintf(f, "BindTexture %s %d\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui), n[2].ui);
+                         _mesa_enum_to_string(n[1].ui), n[2].ui);
             break;
          case OPCODE_SHADE_MODEL:
-            fprintf(f, "ShadeModel %s\n", _mesa_lookup_enum_by_nr(n[1].ui));
+            fprintf(f, "ShadeModel %s\n", _mesa_enum_to_string(n[1].ui));
             break;
          case OPCODE_MAP1:
             fprintf(f, "Map1 %s %.3f %.3f %d %d\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui),
+                         _mesa_enum_to_string(n[1].ui),
                          n[2].f, n[3].f, n[4].i, n[5].i);
             break;
          case OPCODE_MAP2:
             fprintf(f, "Map2 %s %.3f %.3f %.3f %.3f %d %d %d %d\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui),
+                         _mesa_enum_to_string(n[1].ui),
                          n[2].f, n[3].f, n[4].f, n[5].f,
                          n[6].i, n[7].i, n[8].i, n[9].i);
             break;
@@ -9918,7 +9918,7 @@ print_list(struct gl_context *ctx, GLuint list, const char *fname)
 
          case OPCODE_PROVOKING_VERTEX:
             fprintf(f, "ProvokingVertex %s\n",
-                         _mesa_lookup_enum_by_nr(n[1].ui));
+                         _mesa_enum_to_string(n[1].ui));
             break;
 
             /*
diff --git a/src/mesa/main/drawpix.c b/src/mesa/main/drawpix.c
index 55035f214b3..720a082ce6d 100644
--- a/src/mesa/main/drawpix.c
+++ b/src/mesa/main/drawpix.c
@@ -53,10 +53,10 @@ _mesa_DrawPixels( GLsizei width, GLsizei height,
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glDrawPixels(%d, %d, %s, %s, %p) // to %s at %d, %d\n",
                   width, height,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type),
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type),
                   pixels,
-                  _mesa_lookup_enum_by_nr(ctx->DrawBuffer->ColorDrawBuffer[0]),
+                  _mesa_enum_to_string(ctx->DrawBuffer->ColorDrawBuffer[0]),
                   IROUND(ctx->Current.RasterPos[0]),
                   IROUND(ctx->Current.RasterPos[1]));
 
@@ -96,8 +96,8 @@ _mesa_DrawPixels( GLsizei width, GLsizei height,
    err = _mesa_error_check_format_and_type(ctx, format, type);
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err, "glDrawPixels(invalid format %s and/or type %s)",
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       goto end;
    }
 
@@ -198,9 +198,9 @@ _mesa_CopyPixels( GLint srcx, GLint srcy, GLsizei width, GLsizei height,
       _mesa_debug(ctx,
                   "glCopyPixels(%d, %d, %d, %d, %s) // from %s to %s at %d, %d\n",
                   srcx, srcy, width, height,
-                  _mesa_lookup_enum_by_nr(type),
-                  _mesa_lookup_enum_by_nr(ctx->ReadBuffer->ColorReadBuffer),
-                  _mesa_lookup_enum_by_nr(ctx->DrawBuffer->ColorDrawBuffer[0]),
+                  _mesa_enum_to_string(type),
+                  _mesa_enum_to_string(ctx->ReadBuffer->ColorReadBuffer),
+                  _mesa_enum_to_string(ctx->DrawBuffer->ColorDrawBuffer[0]),
                   IROUND(ctx->Current.RasterPos[0]),
                   IROUND(ctx->Current.RasterPos[1]));
 
@@ -218,7 +218,7 @@ _mesa_CopyPixels( GLint srcx, GLint srcy, GLsizei width, GLsizei height,
        type != GL_STENCIL &&
        type != GL_DEPTH_STENCIL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glCopyPixels(type=%s)",
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(type));
       return;
    }
 
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index 9008a386343..42f67990784 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -146,7 +146,7 @@ client_state(struct gl_context *ctx, GLenum cap, GLboolean state)
 
 invalid_enum_error:
    _mesa_error(ctx, GL_INVALID_ENUM, "gl%sClientState(%s)",
-               state ? "Enable" : "Disable", _mesa_lookup_enum_by_nr(cap));
+               state ? "Enable" : "Disable", _mesa_enum_to_string(cap));
 }
 
 
@@ -283,7 +283,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "%s %s (newstate is %x)\n",
                   state ? "glEnable" : "glDisable",
-                  _mesa_lookup_enum_by_nr(cap),
+                  _mesa_enum_to_string(cap),
                   ctx->NewState);
 
    switch (cap) {
@@ -1001,7 +1001,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
 
       /* ARB_texture_multisample */
       case GL_SAMPLE_MASK:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles31(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_texture_multisample, cap);
          if (ctx->Multisample.SampleMask == state)
@@ -1022,7 +1022,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
 
 invalid_enum_error:
    _mesa_error(ctx, GL_INVALID_ENUM, "gl%s(%s)",
-               state ? "Enable" : "Disable", _mesa_lookup_enum_by_nr(cap));
+               state ? "Enable" : "Disable", _mesa_enum_to_string(cap));
 }
 
 
@@ -1101,7 +1101,7 @@ _mesa_set_enablei(struct gl_context *ctx, GLenum cap,
 invalid_enum_error:
     _mesa_error(ctx, GL_INVALID_ENUM, "%s(cap=%s)",
                 state ? "glEnablei" : "glDisablei",
-                _mesa_lookup_enum_by_nr(cap));
+                _mesa_enum_to_string(cap));
 }
 
 
@@ -1143,7 +1143,7 @@ _mesa_IsEnabledi( GLenum cap, GLuint index )
       return (ctx->Scissor.EnableFlags >> index) & 1;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glIsEnabledIndexed(cap=%s)",
-                  _mesa_lookup_enum_by_nr(cap));
+                  _mesa_enum_to_string(cap));
       return GL_FALSE;
    }
 }
@@ -1603,7 +1603,7 @@ _mesa_IsEnabled( GLenum cap )
 
       /* ARB_texture_multisample */
       case GL_SAMPLE_MASK:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles31(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_texture_multisample);
          return ctx->Multisample.SampleMask;
@@ -1623,6 +1623,6 @@ _mesa_IsEnabled( GLenum cap )
 
 invalid_enum_error:
    _mesa_error(ctx, GL_INVALID_ENUM, "glIsEnabled(%s)",
-               _mesa_lookup_enum_by_nr(cap));
+               _mesa_enum_to_string(cap));
    return GL_FALSE;
 }
diff --git a/src/mesa/main/enums.h b/src/mesa/main/enums.h
index 66bdd53bbab..0e18cd407e9 100644
--- a/src/mesa/main/enums.h
+++ b/src/mesa/main/enums.h
@@ -42,7 +42,7 @@ extern "C" {
 #endif
 
 
-extern const char *_mesa_lookup_enum_by_nr( int nr );
+extern const char *_mesa_enum_to_string( int nr );
 
 /* Get the name of an enum given that it is a primitive type.  Avoids
  * GL_FALSE/GL_POINTS ambiguity and others.
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index b3406665d94..f720de316e4 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -1314,7 +1314,7 @@ flush_delayed_errors( struct gl_context *ctx )
    if (ctx->ErrorDebugCount) {
       _mesa_snprintf(s, MAX_DEBUG_MESSAGE_LENGTH, "%d similar %s errors", 
                      ctx->ErrorDebugCount,
-                     _mesa_lookup_enum_by_nr(ctx->ErrorValue));
+                     _mesa_enum_to_string(ctx->ErrorValue));
 
       output_if_debug("Mesa", s, GL_TRUE);
 
@@ -1503,7 +1503,7 @@ _mesa_error( struct gl_context *ctx, GLenum error, const char *fmtString, ... )
       }
 
       len = _mesa_snprintf(s2, MAX_DEBUG_MESSAGE_LENGTH, "%s in %s",
-                           _mesa_lookup_enum_by_nr(error), s);
+                           _mesa_enum_to_string(error), s);
       if (len >= MAX_DEBUG_MESSAGE_LENGTH) {
          /* Same as above. */
          assert(0);
diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h
index 24f234f7f10..81e47a8b8c1 100644
--- a/src/mesa/main/errors.h
+++ b/src/mesa/main/errors.h
@@ -37,6 +37,7 @@
 
 
 #include <stdio.h>
+#include <stdarg.h>
 #include "compiler.h"
 #include "glheader.h"
 #include "mtypes.h"
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 4176a69ed7c..d934d19c3e7 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -121,6 +121,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_framebuffer_object",                  o(ARB_framebuffer_object),                  GL,             2005 },
    { "GL_ARB_framebuffer_sRGB",                    o(EXT_framebuffer_sRGB),                    GL,             1998 },
    { "GL_ARB_get_program_binary",                  o(dummy_true),                              GL,             2010 },
+   { "GL_ARB_get_texture_sub_image",               o(dummy_true),                              GL,             2014 },
    { "GL_ARB_gpu_shader5",                         o(ARB_gpu_shader5),                         GLC,            2010 },
    { "GL_ARB_gpu_shader_fp64",                     o(ARB_gpu_shader_fp64),                     GLC,            2010 },
    { "GL_ARB_half_float_pixel",                    o(dummy_true),                              GL,             2003 },
@@ -154,6 +155,8 @@ static const struct extension extension_table[] = {
    { "GL_ARB_shader_objects",                      o(dummy_true),                              GL,             2002 },
    { "GL_ARB_shader_precision",                    o(ARB_shader_precision),                    GL,             2010 },
    { "GL_ARB_shader_stencil_export",               o(ARB_shader_stencil_export),               GL,             2009 },
+   { "GL_ARB_shader_storage_buffer_object",        o(ARB_shader_storage_buffer_object),        GL,             2012 },
+   { "GL_ARB_shader_subroutine",                   o(ARB_shader_subroutine),                   GLC,            2010 },
    { "GL_ARB_shader_texture_lod",                  o(ARB_shader_texture_lod),                  GL,             2009 },
    { "GL_ARB_shading_language_100",                o(dummy_true),                              GLL,            2003 },
    { "GL_ARB_shading_language_packing",            o(ARB_shading_language_packing),            GL,             2011 },
@@ -382,6 +385,9 @@ static const struct extension extension_table[] = {
    { "GL_NV_point_sprite",                         o(NV_point_sprite),                         GL,             2001 },
    { "GL_NV_primitive_restart",                    o(NV_primitive_restart),                    GLL,            2002 },
    { "GL_NV_read_buffer",                          o(dummy_true),                              ES2,            2011 },
+   { "GL_NV_read_depth",                           o(dummy_true),                              ES2,            2011 },
+   { "GL_NV_read_depth_stencil",                   o(dummy_true),                              ES2,            2011 },
+   { "GL_NV_read_stencil",                         o(dummy_true),                              ES2,            2011 },
    { "GL_NV_texgen_reflection",                    o(dummy_true),                              GLL,            1999 },
    { "GL_NV_texture_barrier",                      o(NV_texture_barrier),                      GL,             2009 },
    { "GL_NV_texture_env_combine4",                 o(NV_texture_env_combine4),                 GLL,            1999 },
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index f8dcf122d99..841834030df 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -2007,7 +2007,7 @@ renderbuffer_storage(struct gl_context *ctx, struct gl_renderbuffer *rb,
    baseFormat = _mesa_base_fbo_format(ctx, internalFormat);
    if (baseFormat == 0) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalFormat=%s)",
-                  func, _mesa_lookup_enum_by_nr(internalFormat));
+                  func, _mesa_enum_to_string(internalFormat));
       return;
    }
 
@@ -2095,12 +2095,12 @@ renderbuffer_storage_named(GLuint renderbuffer, GLenum internalFormat,
       if (samples == NO_SAMPLES)
          _mesa_debug(ctx, "%s(%u, %s, %d, %d)\n",
                      func, renderbuffer,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(internalFormat),
                      width, height);
       else
          _mesa_debug(ctx, "%s(%u, %s, %d, %d, %d)\n",
                      func, renderbuffer,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(internalFormat),
                      width, height, samples);
    }
 
@@ -2131,14 +2131,14 @@ renderbuffer_storage_target(GLenum target, GLenum internalFormat,
       if (samples == NO_SAMPLES)
          _mesa_debug(ctx, "%s(%s, %s, %d, %d)\n",
                      func,
-                     _mesa_lookup_enum_by_nr(target),
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target),
+                     _mesa_enum_to_string(internalFormat),
                      width, height);
       else
          _mesa_debug(ctx, "%s(%s, %s, %d, %d, %d)\n",
                      func,
-                     _mesa_lookup_enum_by_nr(target),
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target),
+                     _mesa_enum_to_string(internalFormat),
                      width, height, samples);
    }
 
@@ -2311,7 +2311,7 @@ get_render_buffer_parameteriv(struct gl_context *ctx,
       /* fallthrough */
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname=%s)", func,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 }
@@ -2694,13 +2694,13 @@ _mesa_CheckFramebufferStatus(GLenum target)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glCheckFramebufferStatus(%s)\n",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
 
    fb = get_framebuffer_target(ctx, target);
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCheckFramebufferStatus(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return 0;
    }
 
@@ -2732,7 +2732,7 @@ _mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target)
       default:
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCheckNamedFramebufferStatus(invalid target %s)",
-                     _mesa_lookup_enum_by_nr(target));
+                     _mesa_enum_to_string(target));
          return 0;
    }
 
@@ -2851,7 +2851,7 @@ check_layered_texture_target(struct gl_context *ctx, GLenum target,
 
    _mesa_error(ctx, GL_INVALID_OPERATION,
                "%s(invalid texture target %s)", caller,
-               _mesa_lookup_enum_by_nr(target));
+               _mesa_enum_to_string(target));
    return false;
 }
 
@@ -2893,7 +2893,7 @@ check_texture_target(struct gl_context *ctx, GLenum target,
 
    _mesa_error(ctx, GL_INVALID_OPERATION,
                "%s(invalid texture target %s)", caller,
-               _mesa_lookup_enum_by_nr(target));
+               _mesa_enum_to_string(target));
    return false;
 }
 
@@ -2944,8 +2944,9 @@ check_textarget(struct gl_context *ctx, int dims, GLenum target,
          break;
       case GL_TEXTURE_2D_MULTISAMPLE:
       case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-         err = _mesa_is_gles(ctx)
-               || !ctx->Extensions.ARB_texture_multisample;
+         err = (_mesa_is_gles(ctx) ||
+                !ctx->Extensions.ARB_texture_multisample) &&
+               !_mesa_is_gles31(ctx);
          break;
       default:
          err = true;
@@ -2962,7 +2963,7 @@ check_textarget(struct gl_context *ctx, int dims, GLenum target,
    if (err) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(invalid textarget %s)",
-                  caller, _mesa_lookup_enum_by_nr(textarget));
+                  caller, _mesa_enum_to_string(textarget));
       return false;
    }
 
@@ -3074,7 +3075,7 @@ _mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
    att = get_attachment(ctx, fb, attachment);
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
-                  _mesa_lookup_enum_by_nr(attachment));
+                  _mesa_enum_to_string(attachment));
       return;
    }
 
@@ -3157,7 +3158,7 @@ framebuffer_texture_with_dims(int dims, GLenum target,
    fb = get_framebuffer_target(ctx, target);
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3225,7 +3226,7 @@ _mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glFramebufferTextureLayer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3304,7 +3305,7 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment,
    GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *fb;
    struct gl_texture_object *texObj;
-   GLboolean layered;
+   GLboolean layered = GL_FALSE;
 
    const char *func = "FramebufferTexture";
 
@@ -3319,7 +3320,7 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glFramebufferTexture(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3347,7 +3348,7 @@ _mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
    GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *fb;
    struct gl_texture_object *texObj;
-   GLboolean layered;
+   GLboolean layered = GL_FALSE;
 
    const char *func = "glNamedFramebufferTexture";
 
@@ -3400,7 +3401,7 @@ _mesa_framebuffer_renderbuffer(struct gl_context *ctx,
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(invalid attachment %s)", func,
-                  _mesa_lookup_enum_by_nr(attachment));
+                  _mesa_enum_to_string(attachment));
       return;
    }
 
@@ -3440,7 +3441,7 @@ _mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glFramebufferRenderbuffer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -3539,7 +3540,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
           attachment != GL_DEPTH && attachment != GL_STENCIL) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "%s(invalid attachment %s)", caller,
-                     _mesa_lookup_enum_by_nr(attachment));
+                     _mesa_enum_to_string(attachment));
          return;
       }
       /* the default / window-system FBO */
@@ -3552,7 +3553,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
 
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
-                  _mesa_lookup_enum_by_nr(attachment));
+                  _mesa_enum_to_string(attachment));
       return;
    }
 
@@ -3609,7 +3610,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3626,7 +3627,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3637,7 +3638,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
          goto invalid_pname_enum;
       } else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       } else if (att->Type == GL_TEXTURE) {
          if (att->Texture && (att->Texture->Target == GL_TEXTURE_3D ||
              att->Texture->Target == GL_TEXTURE_2D_ARRAY)) {
@@ -3659,7 +3660,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          if (ctx->Extensions.EXT_framebuffer_sRGB) {
@@ -3682,7 +3683,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else {
          mesa_format format = att->Renderbuffer->Format;
@@ -3734,7 +3735,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
       }
       else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       }
       else if (att->Texture) {
          const struct gl_texture_image *texImage =
@@ -3763,7 +3764,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
          *params = att->Layered;
       } else if (att->Type == GL_NONE) {
          _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
       } else {
          goto invalid_pname_enum;
       }
@@ -3776,7 +3777,7 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
 
 invalid_pname_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname %s)", caller,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return;
 }
 
@@ -3792,7 +3793,7 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
    if (!buffer) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetFramebufferAttachmentParameteriv(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4009,7 +4010,7 @@ invalidate_framebuffer_storage(struct gl_context *ctx,
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", name,
-               _mesa_lookup_enum_by_nr(attachments[i]));
+               _mesa_enum_to_string(attachments[i]));
    return;
 }
 
@@ -4026,7 +4027,7 @@ _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glInvalidateSubFramebuffer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4076,7 +4077,7 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glInvalidateFramebuffer(invalid target %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4152,7 +4153,7 @@ _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
    if (!fb) {
       _mesa_error(ctx, GL_INVALID_ENUM,
          "glDiscardFramebufferEXT(target %s)",
-         _mesa_lookup_enum_by_nr(target));
+         _mesa_enum_to_string(target));
       return;
    }
 
@@ -4189,5 +4190,5 @@ _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM,
                "glDiscardFramebufferEXT(attachment %s)",
-              _mesa_lookup_enum_by_nr(attachments[i]));
+              _mesa_enum_to_string(attachments[i]));
 }
diff --git a/src/mesa/main/feedback.c b/src/mesa/main/feedback.c
index 6bc4294f9c7..699e2a855a3 100644
--- a/src/mesa/main/feedback.c
+++ b/src/mesa/main/feedback.c
@@ -415,7 +415,7 @@ _mesa_RenderMode( GLenum mode )
    ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glRenderMode %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glRenderMode %s\n", _mesa_enum_to_string(mode));
 
    FLUSH_VERTICES(ctx, _NEW_RENDERMODE);
 
diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c
index 70adaf88551..95b428dca3e 100644
--- a/src/mesa/main/ffvertex_prog.c
+++ b/src/mesa/main/ffvertex_prog.c
@@ -189,15 +189,15 @@ static void make_state_key( struct gl_context *ctx, struct state_key *key )
 	 if (light->Enabled) {
 	    key->unit[i].light_enabled = 1;
 
-	    if (light->EyePosition[3] == 0.0)
+	    if (light->EyePosition[3] == 0.0F)
 	       key->unit[i].light_eyepos3_is_zero = 1;
 
-	    if (light->SpotCutoff == 180.0)
+	    if (light->SpotCutoff == 180.0F)
 	       key->unit[i].light_spotcutoff_is_180 = 1;
 
-	    if (light->ConstantAttenuation != 1.0 ||
-		light->LinearAttenuation != 0.0 ||
-		light->QuadraticAttenuation != 0.0)
+	    if (light->ConstantAttenuation != 1.0F ||
+		light->LinearAttenuation != 0.0F ||
+		light->QuadraticAttenuation != 0.0F)
 	       key->unit[i].light_attenuated = 1;
 	 }
       }
diff --git a/src/mesa/main/fog.c b/src/mesa/main/fog.c
index 3bce289e785..45f343d61c8 100644
--- a/src/mesa/main/fog.c
+++ b/src/mesa/main/fog.c
@@ -115,7 +115,7 @@ _mesa_Fogfv( GLenum pname, const GLfloat *params )
 	 ctx->Fog.Mode = m;
 	 break;
       case GL_FOG_DENSITY:
-	 if (*params<0.0) {
+	 if (*params<0.0F) {
 	    _mesa_error( ctx, GL_INVALID_VALUE, "glFog" );
             return;
 	 }
diff --git a/src/mesa/main/format_parser.py b/src/mesa/main/format_parser.py
index 11184f78e2c..799b14f0b1c 100755
--- a/src/mesa/main/format_parser.py
+++ b/src/mesa/main/format_parser.py
@@ -40,9 +40,6 @@ SRGB = 'srgb'
 YUV = 'yuv'
 ZS = 'zs'
 
-def is_power_of_two(x):
-   return not bool(x & (x - 1))
-
 VERY_LARGE = 99999999999999999999999
 
 class Channel:
@@ -100,10 +97,6 @@ class Channel:
       else:
          return 1
 
-   def is_power_of_two(self):
-      """Returns true if the size of this channel is a power of two."""
-      return is_power_of_two(self.size)
-
    def datatype(self):
       """Returns the datatype corresponding to a channel type and size"""
       return _get_datatype(self.type, self.size)
diff --git a/src/mesa/main/format_utils.h b/src/mesa/main/format_utils.h
index 7f500ec78da..618f43d0aaa 100644
--- a/src/mesa/main/format_utils.h
+++ b/src/mesa/main/format_utils.h
@@ -33,6 +33,7 @@
 
 #include "imports.h"
 #include "macros.h"
+#include "util/rounding.h"
 
 extern const mesa_array_format RGBA32_FLOAT;
 extern const mesa_array_format RGBA8_UBYTE;
@@ -84,7 +85,7 @@ _mesa_float_to_unorm(float x, unsigned dst_bits)
    else if (x > 1.0f)
       return MAX_UINT(dst_bits);
    else
-      return F_TO_I(x * MAX_UINT(dst_bits));
+      return _mesa_lroundevenf(x * MAX_UINT(dst_bits));
 }
 
 static inline unsigned
@@ -98,7 +99,7 @@ _mesa_unorm_to_unorm(unsigned x, unsigned src_bits, unsigned dst_bits)
 {
    if (src_bits < dst_bits) {
       return EXTEND_NORMALIZED_INT(x, src_bits, dst_bits);
-   } else {
+   } else if (src_bits > dst_bits) {
       unsigned src_half = (1 << (src_bits - 1)) - 1;
 
       if (src_bits + dst_bits > sizeof(x) * 8) {
@@ -108,6 +109,8 @@ _mesa_unorm_to_unorm(unsigned x, unsigned src_bits, unsigned dst_bits)
       } else {
          return (x * MAX_UINT(dst_bits) + src_half) / MAX_UINT(src_bits);
       }
+   } else {
+      return x;
    }
 }
 
@@ -128,7 +131,7 @@ _mesa_float_to_snorm(float x, unsigned dst_bits)
    else if (x > 1.0f)
       return MAX_INT(dst_bits);
    else
-      return F_TO_I(x * MAX_INT(dst_bits));
+      return _mesa_lroundevenf(x * MAX_INT(dst_bits));
 }
 
 static inline int
diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index 7741cabada1..85f7b6b5664 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -74,13 +74,15 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       /* These enums are only valid if ARB_texture_multisample is supported */
-      if (_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample)
+      if ((_mesa_is_desktop_gl(ctx) &&
+           ctx->Extensions.ARB_texture_multisample) ||
+          _mesa_is_gles31(ctx))
          break;
 
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetInternalformativ(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -107,7 +109,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
        _mesa_base_fbo_format(ctx, internalformat) == 0) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetInternalformativ(internalformat=%s)",
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 
@@ -119,7 +121,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
    if (bufSize < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glGetInternalformativ(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -168,7 +170,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetInternalformativ(pname=%s)",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index baeb1bfe5de..d7b2bae59e7 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -354,14 +354,22 @@ _mesa_array_format_flip_channels(mesa_array_format format)
       return format;
 
    if (num_channels == 2) {
-      _mesa_array_format_set_swizzle(&format, swizzle[1], swizzle[0],
-                                     swizzle[2], swizzle[3]);
+      /* Assert that the swizzle makes sense for 2 channels */
+      for (unsigned i = 0; i < 4; i++)
+         assert(swizzle[i] != 2 && swizzle[i] != 3);
+
+      static const uint8_t flip_xy[6] = { 1, 0, 2, 3, 4, 5 };
+      _mesa_array_format_set_swizzle(&format,
+                                     flip_xy[swizzle[0]], flip_xy[swizzle[1]],
+                                     flip_xy[swizzle[2]], flip_xy[swizzle[3]]);
       return format;
    }
 
    if (num_channels == 4) {
-      _mesa_array_format_set_swizzle(&format, swizzle[3], swizzle[2],
-                                     swizzle[1], swizzle[0]);
+      static const uint8_t flip[6] = { 3, 2, 1, 0, 4, 5 };
+      _mesa_array_format_set_swizzle(&format,
+                                     flip[swizzle[0]], flip[swizzle[1]],
+                                     flip[swizzle[2]], flip[swizzle[3]]);
       return format;
    }
 
@@ -372,10 +380,11 @@ uint32_t
 _mesa_format_to_array_format(mesa_format format)
 {
    const struct gl_format_info *info = _mesa_get_format_info(format);
-   if (_mesa_little_endian())
-      return info->ArrayFormat;
-   else
+   if (info->ArrayFormat && !_mesa_little_endian() &&
+       info->Layout == MESA_FORMAT_LAYOUT_PACKED)
       return _mesa_array_format_flip_channels(info->ArrayFormat);
+   else
+      return info->ArrayFormat;
 }
 
 static struct hash_table *format_array_format_table;
diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index 7e451caf0ff..d938e6ad513 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -191,6 +191,11 @@ static inline void
 _mesa_array_format_set_swizzle(mesa_array_format *f,
                                int32_t x, int32_t y, int32_t z, int32_t w)
 {
+   *f &= ~(MESA_ARRAY_FORMAT_SWIZZLE_X_MASK |
+           MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK |
+           MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK |
+           MESA_ARRAY_FORMAT_SWIZZLE_W_MASK);
+
    *f |= ((x << 8 ) & MESA_ARRAY_FORMAT_SWIZZLE_X_MASK) |
          ((y << 11) & MESA_ARRAY_FORMAT_SWIZZLE_Y_MASK) |
          ((z << 14) & MESA_ARRAY_FORMAT_SWIZZLE_Z_MASK) |
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 77c04b8dab8..37e2c29c89c 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -938,7 +938,7 @@ _mesa_print_framebuffer(const struct gl_framebuffer *fb)
 
    fprintf(stderr, "Mesa Framebuffer %u at %p\n", fb->Name, (void *) fb);
    fprintf(stderr, "  Size: %u x %u  Status: %s\n", fb->Width, fb->Height,
-           _mesa_lookup_enum_by_nr(fb->_Status));
+           _mesa_enum_to_string(fb->_Status));
    fprintf(stderr, "  Attachments:\n");
 
    for (i = 0; i < BUFFER_COUNT; i++) {
diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index 9aef090194e..c18f9d5223f 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -83,7 +83,7 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx,
 
    if (error) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGenerate%sMipmap(target=%s)",
-                  suffix, _mesa_lookup_enum_by_nr(target));
+                  suffix, _mesa_enum_to_string(target));
       return;
    }
 
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 3d6d63916b3..307a5ffbd1c 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -149,6 +149,8 @@ enum value_extra {
    EXTRA_EXT_UBO_GS4,
    EXTRA_EXT_ATOMICS_GS4,
    EXTRA_EXT_SHADER_IMAGE_GS4,
+   EXTRA_EXT_ATOMICS_TESS,
+   EXTRA_EXT_SHADER_IMAGE_TESS,
 };
 
 #define NO_EXTRA NULL
@@ -349,12 +351,58 @@ static const int extra_ARB_shader_image_load_store_and_geometry_shader[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_shader_atomic_counters_and_tessellation[] = {
+   EXTRA_EXT_ATOMICS_TESS, /* check_extra(): atomic counters ext && tessellation */
+   EXTRA_END
+};
+
+static const int extra_ARB_shader_image_load_store_and_tessellation[] = {
+   EXTRA_EXT_SHADER_IMAGE_TESS, /* check_extra(): image load/store ext && tessellation */
+   EXTRA_END
+};
+
 static const int extra_ARB_draw_indirect_es31[] = {
    EXT(ARB_draw_indirect),
    EXTRA_API_ES31,
    EXTRA_END
 };
 
+static const int extra_ARB_shader_image_load_store_es31[] = {
+   EXT(ARB_shader_image_load_store),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
+static const int extra_ARB_shader_atomic_counters_es31[] = {
+   EXT(ARB_shader_atomic_counters),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
+static const int extra_ARB_texture_multisample_es31[] = {
+   EXT(ARB_texture_multisample),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
+static const int extra_ARB_texture_gather_es31[] = {
+   EXT(ARB_texture_gather),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
+static const int extra_ARB_compute_shader_es31[] = {
+   EXT(ARB_compute_shader),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
+static const int extra_ARB_explicit_uniform_location_es31[] = {
+   EXT(ARB_explicit_uniform_location),
+   EXTRA_API_ES31,
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
@@ -401,6 +449,8 @@ EXTRA_EXT(ARB_explicit_uniform_location);
 EXTRA_EXT(ARB_clip_control);
 EXTRA_EXT(EXT_polygon_offset_clamp);
 EXTRA_EXT(ARB_framebuffer_no_attachments);
+EXTRA_EXT(ARB_tessellation_shader);
+EXTRA_EXT(ARB_shader_subroutine);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -626,7 +676,7 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
       break;
 
    case GL_EDGE_FLAG:
-      v->value_bool = ctx->Current.Attrib[VERT_ATTRIB_EDGEFLAG][0] == 1.0;
+      v->value_bool = ctx->Current.Attrib[VERT_ATTRIB_EDGEFLAG][0] == 1.0F;
       break;
 
    case GL_READ_BUFFER:
@@ -1149,6 +1199,16 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
          api_found = (ctx->Extensions.ARB_shader_image_load_store &&
                       _mesa_has_geometry_shaders(ctx));
          break;
+      case EXTRA_EXT_ATOMICS_TESS:
+         api_check = GL_TRUE;
+         api_found = ctx->Extensions.ARB_shader_atomic_counters &&
+                     _mesa_has_tessellation(ctx);
+         break;
+      case EXTRA_EXT_SHADER_IMAGE_TESS:
+         api_check = GL_TRUE;
+         api_found = ctx->Extensions.ARB_shader_image_load_store &&
+                     _mesa_has_tessellation(ctx);
+         break;
       case EXTRA_END:
 	 break;
       default: /* *e is a offset into the extension struct */
@@ -1161,7 +1221,7 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
 
    if (api_check && !api_found) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func,
-                  _mesa_lookup_enum_by_nr(d->pname));
+                  _mesa_enum_to_string(d->pname));
       return GL_FALSE;
    }
 
@@ -1208,10 +1268,13 @@ find_value(const char *func, GLenum pname, void **p, union value *v)
     * value since it's compatible with GLES2 its entry in table_set[] is at the
     * end.
     */
-   STATIC_ASSERT(ARRAY_SIZE(table_set) == API_OPENGL_LAST + 2);
+   STATIC_ASSERT(ARRAY_SIZE(table_set) == API_OPENGL_LAST + 3);
    if (_mesa_is_gles3(ctx)) {
       api = API_OPENGL_LAST + 1;
    }
+   if (_mesa_is_gles31(ctx)) {
+      api = API_OPENGL_LAST + 2;
+   }
    mask = ARRAY_SIZE(table(api)) - 1;
    hash = (pname * prime_factor);
    while (1) {
@@ -1222,7 +1285,7 @@ find_value(const char *func, GLenum pname, void **p, union value *v)
        * any valid enum. */
       if (unlikely(idx == 0)) {
          _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
          return &error_value;
       }
 
@@ -2004,11 +2067,11 @@ find_value_indexed(const char *func, GLenum pname, GLuint index, union value *v)
 
  invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return TYPE_INVALID;
  invalid_value:
    _mesa_error(ctx, GL_INVALID_VALUE, "%s(pname=%s)", func,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
    return TYPE_INVALID;
 }
 
diff --git a/src/mesa/main/get_hash_generator.py b/src/mesa/main/get_hash_generator.py
index b200d197341..c777b782442 100644
--- a/src/mesa/main/get_hash_generator.py
+++ b/src/mesa/main/get_hash_generator.py
@@ -44,7 +44,7 @@ prime_factor = 89
 prime_step = 281
 hash_table_size = 1024
 
-gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3"])
+gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3", "GLES31"])
 
 def print_header():
    print "typedef const unsigned short table_t[%d];\n" % (hash_table_size)
@@ -68,6 +68,7 @@ api_enum = [
    'GLES2',
    'GL_CORE',
    'GLES3', # Not in gl_api enum in mtypes.h
+   'GLES31', # Not in gl_api enum in mtypes.h
 ]
 
 def api_index(api):
@@ -167,10 +168,13 @@ def generate_hash_tables(enum_list, enabled_apis, param_descriptors):
 
          for api in valid_apis:
             add_to_hash_table(tables[api], hash_val, len(params))
-            # Also add GLES2 items to the GLES3 hash table
+            # Also add GLES2 items to the GLES3 and GLES31 hash table
             if api == "GLES2":
                add_to_hash_table(tables["GLES3"], hash_val, len(params))
-
+               add_to_hash_table(tables["GLES31"], hash_val, len(params))
+            # Also add GLES3 items to the GLES31 hash table
+            if api == "GLES3":
+               add_to_hash_table(tables["GLES31"], hash_val, len(params))
          params.append(["GL_" + enum_name, param[1]])
 
    sorted_tables={}
@@ -206,7 +210,7 @@ if __name__ == '__main__':
       die("missing descriptor file (-f)\n")
 
    # generate the code for all APIs
-   enabled_apis = set(["GLES", "GLES2", "GLES3", "GL", "GL_CORE"])
+   enabled_apis = set(["GLES", "GLES2", "GLES3", "GLES31", "GL", "GL_CORE"])
 
    try:
       api_desc = gl_XML.parse_GL_API(api_desc_file)
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 74ff3ba6619..7dc92f10100 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -351,6 +351,9 @@ descriptor=[
 # GL_ARB_framebuffer_object
   [ "MAX_SAMPLES", "CONTEXT_INT(Const.MaxSamples), extra_ARB_framebuffer_object_EXT_framebuffer_multisample" ],
 
+# GL_ARB_sampler_objects / GL 3.3 / GLES 3.0
+  [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ],
+
 # GL_ARB_sync
   [ "MAX_SERVER_WAIT_TIMEOUT", "CONTEXT_INT64(Const.MaxServerWaitTimeout), extra_ARB_sync" ],
 
@@ -404,9 +407,49 @@ descriptor=[
   [ "TEXTURE_EXTERNAL_OES", "LOC_CUSTOM, TYPE_BOOLEAN, 0, extra_OES_EGL_image_external" ],
 ]},
 
-{ "apis": ["GL", "GL_CORE", "GLES3"], "params": [
-# GL_ARB_sampler_objects / GL 3.3 / GLES 3.0
-  [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ],
+# Enums in OpenGL and ES 3.1
+{ "apis": ["GL", "GL_CORE", "GLES31"], "params": [
+# GL_ARB_shader_image_load_store / GLES 3.1
+  [ "MAX_IMAGE_UNITS", "CONTEXT_INT(Const.MaxImageUnits), extra_ARB_shader_image_load_store_es31" ],
+  [ "MAX_VERTEX_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ],
+  [ "MAX_FRAGMENT_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms), extra_ARB_shader_image_load_store_es31" ],
+  [ "MAX_COMBINED_IMAGE_UNIFORMS", "CONTEXT_INT(Const.MaxCombinedImageUniforms), extra_ARB_shader_image_load_store_es31" ],
+
+# GL_ARB_shader_atomic_counters / GLES 3.1
+  [ "ATOMIC_COUNTER_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_ATOMIC_COUNTER_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxAtomicBufferBindings), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_ATOMIC_COUNTER_BUFFER_SIZE", "CONTEXT_INT(Const.MaxAtomicBufferSize), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_VERTEX_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_VERTEX_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_FRAGMENT_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_FRAGMENT_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_COMBINED_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.MaxCombinedAtomicBuffers), extra_ARB_shader_atomic_counters_es31" ],
+  [ "MAX_COMBINED_ATOMIC_COUNTERS", "CONTEXT_INT(Const.MaxCombinedAtomicCounters), extra_ARB_shader_atomic_counters_es31" ],
+
+# GL_ARB_texture_multisample / GLES 3.1
+  [ "TEXTURE_BINDING_2D_MULTISAMPLE", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_INDEX, extra_ARB_texture_multisample_es31" ],
+  [ "MAX_COLOR_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxColorTextureSamples), extra_ARB_texture_multisample_es31" ],
+  [ "MAX_DEPTH_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxDepthTextureSamples), extra_ARB_texture_multisample_es31" ],
+  [ "MAX_INTEGER_SAMPLES", "CONTEXT_INT(Const.MaxIntegerSamples), extra_ARB_texture_multisample_es31" ],
+  [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample_es31" ],
+  [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample_es31" ],
+
+# GL_ARB_texture_gather / GLES 3.1
+  [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather_es31"],
+  [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather_es31"],
+
+# GL_ARB_compute_shader / GLES 3.1
+  [ "MAX_COMPUTE_WORK_GROUP_INVOCATIONS", "CONTEXT_INT(Const.MaxComputeWorkGroupInvocations), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_UNIFORM_BLOCKS", "CONST(MAX_COMPUTE_UNIFORM_BLOCKS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_TEXTURE_IMAGE_UNITS", "CONST(MAX_COMPUTE_TEXTURE_IMAGE_UNITS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_ATOMIC_COUNTERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTERS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_SHARED_MEMORY_SIZE", "CONST(MAX_COMPUTE_SHARED_MEMORY_SIZE), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader_es31" ],
+
+# GL_ARB_explicit_uniform_location / GLES 3.1
+  [ "MAX_UNIFORM_LOCATIONS", "CONTEXT_INT(Const.MaxUserAssignableUniformLocations), extra_ARB_explicit_uniform_location_es31" ],
 ]},
 
 # Enums in OpenGL Core profile and ES 3.1
@@ -498,7 +541,6 @@ descriptor=[
   [ "MAX_LIST_NESTING", "CONST(MAX_LIST_NESTING), NO_EXTRA" ],
   [ "MAX_NAME_STACK_DEPTH", "CONST(MAX_NAME_STACK_DEPTH), NO_EXTRA" ],
   [ "MAX_PIXEL_MAP_TABLE", "CONST(MAX_PIXEL_MAP_TABLE), NO_EXTRA" ],
-  [ "MAX_UNIFORM_LOCATIONS", "CONTEXT_INT(Const.MaxUserAssignableUniformLocations), extra_ARB_explicit_uniform_location" ],
   [ "NAME_STACK_DEPTH", "CONTEXT_INT(Select.NameStackDepth), NO_EXTRA" ],
   [ "PACK_LSB_FIRST", "CONTEXT_BOOL(Pack.LsbFirst), NO_EXTRA" ],
   [ "PACK_SWAP_BYTES", "CONTEXT_BOOL(Pack.SwapBytes), NO_EXTRA" ],
@@ -699,13 +741,7 @@ descriptor=[
   [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
 
 # GL_ARB_texture_multisample / GL 3.2
-  [ "TEXTURE_BINDING_2D_MULTISAMPLE", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_INDEX, extra_ARB_texture_multisample" ],
   [ "TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX, extra_ARB_texture_multisample" ],
-  [ "MAX_COLOR_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxColorTextureSamples), extra_ARB_texture_multisample" ],
-  [ "MAX_DEPTH_TEXTURE_SAMPLES", "CONTEXT_INT(Const.MaxDepthTextureSamples), extra_ARB_texture_multisample" ],
-  [ "MAX_INTEGER_SAMPLES", "CONTEXT_INT(Const.MaxIntegerSamples), extra_ARB_texture_multisample" ],
-  [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample" ],
-  [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample" ],
 
 # GL 3.0
   [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ],
@@ -756,48 +792,23 @@ descriptor=[
   [ "TEXTURE_BINDING_CUBE_MAP_ARRAY_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_CUBE_ARRAY_INDEX, extra_ARB_texture_cube_map_array" ],
 
 # GL_ARB_texture_gather
-  [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather"],
-  [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather"],
   [ "MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB", "CONTEXT_INT(Const.MaxProgramTextureGatherComponents), extra_ARB_texture_gather"],
 
 # GL_ARB_separate_shader_objects
   [ "PROGRAM_PIPELINE_BINDING", "LOC_CUSTOM, TYPE_INT, GL_PROGRAM_PIPELINE_BINDING, NO_EXTRA" ],
 
 # GL_ARB_shader_atomic_counters
-  [ "ATOMIC_COUNTER_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_atomic_counters" ],
-  [ "MAX_ATOMIC_COUNTER_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxAtomicBufferBindings), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_ATOMIC_COUNTER_BUFFER_SIZE", "CONTEXT_INT(Const.MaxAtomicBufferSize), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_VERTEX_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicBuffers), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_VERTEX_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_FRAGMENT_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_FRAGMENT_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters), extra_ARB_shader_atomic_counters" ],
   [ "MAX_GEOMETRY_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_geometry_shader" ],
   [ "MAX_GEOMETRY_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_geometry_shader" ],
-  [ "MAX_COMBINED_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.MaxCombinedAtomicBuffers), extra_ARB_shader_atomic_counters" ],
-  [ "MAX_COMBINED_ATOMIC_COUNTERS", "CONTEXT_INT(Const.MaxCombinedAtomicCounters), extra_ARB_shader_atomic_counters" ],
 
 # GL_ARB_vertex_attrib_binding
   [ "MAX_VERTEX_ATTRIB_RELATIVE_OFFSET", "CONTEXT_ENUM(Const.MaxVertexAttribRelativeOffset), NO_EXTRA" ],
   [ "MAX_VERTEX_ATTRIB_BINDINGS", "CONTEXT_ENUM(Const.MaxVertexAttribBindings), NO_EXTRA" ],
 
 # GL_ARB_shader_image_load_store
-  [ "MAX_IMAGE_UNITS", "CONTEXT_INT(Const.MaxImageUnits), extra_ARB_shader_image_load_store"],
-  [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedImageUnitsAndFragmentOutputs), extra_ARB_shader_image_load_store"],
-  [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store"],
-  [ "MAX_VERTEX_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms), extra_ARB_shader_image_load_store"],
+  [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedImageUnitsAndFragmentOutputs), extra_ARB_shader_image_load_store" ],
+  [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ],
   [ "MAX_GEOMETRY_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms), extra_ARB_shader_image_load_store_and_geometry_shader"],
-  [ "MAX_FRAGMENT_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_FRAGMENT].MaxImageUniforms), extra_ARB_shader_image_load_store"],
-  [ "MAX_COMBINED_IMAGE_UNIFORMS", "CONTEXT_INT(Const.MaxCombinedImageUniforms), extra_ARB_shader_image_load_store"],
-
-# GL_ARB_compute_shader
-  [ "MAX_COMPUTE_WORK_GROUP_INVOCATIONS", "CONTEXT_INT(Const.MaxComputeWorkGroupInvocations), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_UNIFORM_BLOCKS", "CONST(MAX_COMPUTE_UNIFORM_BLOCKS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_TEXTURE_IMAGE_UNITS", "CONST(MAX_COMPUTE_TEXTURE_IMAGE_UNITS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_ATOMIC_COUNTERS", "CONST(MAX_COMPUTE_ATOMIC_COUNTERS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_SHARED_MEMORY_SIZE", "CONST(MAX_COMPUTE_SHARED_MEMORY_SIZE), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader" ],
-  [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader" ],
 
 # GL_ARB_framebuffer_no_attachments
   ["MAX_FRAMEBUFFER_WIDTH", "CONTEXT_INT(Const.MaxFramebufferWidth), extra_ARB_framebuffer_no_attachments"],
@@ -826,6 +837,38 @@ descriptor=[
   [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
   [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
   [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
+
+# GL_ARB_tessellation_shader
+  [ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ],
+  [ "PATCH_DEFAULT_OUTER_LEVEL", "CONTEXT_FLOAT4(TessCtrlProgram.patch_default_outer_level), extra_ARB_tessellation_shader" ],
+  [ "PATCH_DEFAULT_INNER_LEVEL", "CONTEXT_FLOAT2(TessCtrlProgram.patch_default_inner_level), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_GEN_LEVEL", "CONTEXT_INT(Const.MaxTessGenLevel), extra_ARB_tessellation_shader" ],
+  [ "MAX_PATCH_VERTICES", "CONTEXT_INT(Const.MaxPatchVertices), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxOutputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_PATCH_COMPONENTS", "CONTEXT_INT(Const.MaxTessPatchComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_TOTAL_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.MaxTessControlTotalOutputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_OUTPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxOutputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxInputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_INPUT_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxInputComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_CONTROL_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks), extra_ARB_tessellation_shader" ],
+  [ "MAX_TESS_EVALUATION_UNIFORM_BLOCKS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformBlocks), extra_ARB_tessellation_shader" ],
+  [ "MAX_COMBINED_TESS_CONTROL_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxCombinedUniformComponents), extra_ARB_tessellation_shader" ],
+  [ "MAX_COMBINED_TESS_EVALUATION_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxCombinedUniformComponents), extra_ARB_tessellation_shader" ],
+# Dependencies on GL_ARB_tessellation_shader
+  [ "MAX_TESS_CONTROL_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_CONTROL_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_EVALUATION_ATOMIC_COUNTER_BUFFERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicBuffers), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_EVALUATION_ATOMIC_COUNTERS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxAtomicCounters), extra_ARB_shader_atomic_counters_and_tessellation" ],
+  [ "MAX_TESS_CONTROL_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_CTRL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"],
+  [ "MAX_TESS_EVALUATION_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_TESS_EVAL].MaxImageUniforms), extra_ARB_shader_image_load_store_and_tessellation"],
+
+# GL_ARB_shader_subroutine
+  [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), extra_ARB_shader_subroutine" ],
+  [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), extra_ARB_shader_subroutine" ],
 ]}
 
 ]
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 72d99ca4e22..9873fdbf1a4 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -208,7 +208,7 @@ _mesa_GetPointerv( GLenum pname, GLvoid **params )
       return;
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glGetPointerv %s\n", _mesa_lookup_enum_by_nr(pname));
+      _mesa_debug(ctx, "glGetPointerv %s\n", _mesa_enum_to_string(pname));
 
    switch (pname) {
       case GL_VERTEX_ARRAY_POINTER:
@@ -299,7 +299,7 @@ _mesa_GetError( void )
    ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glGetError <-- %s\n", _mesa_lookup_enum_by_nr(e));
+      _mesa_debug(ctx, "glGetError <-- %s\n", _mesa_enum_to_string(e));
 
    ctx->ErrorValue = (GLenum) GL_NO_ERROR;
    ctx->ErrorDebugCount = 0;
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index ac69fabccaa..3eb66dab7f8 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -186,7 +186,7 @@ get_map_idx(GLenum value)
       return IDX_RG;
    default:
       _mesa_problem(NULL, "Unexpected inFormat %s",
-                    _mesa_lookup_enum_by_nr(value));
+                    _mesa_enum_to_string(value));
       return 0;
    }
 }
@@ -216,8 +216,8 @@ _mesa_compute_component_mapping(GLenum inFormat, GLenum outFormat, GLubyte *map)
 
 #if 0
    printf("from %x/%s to %x/%s map %d %d %d %d %d %d\n",
-	  inFormat, _mesa_lookup_enum_by_nr(inFormat),
-	  outFormat, _mesa_lookup_enum_by_nr(outFormat),
+	  inFormat, _mesa_enum_to_string(inFormat),
+	  outFormat, _mesa_enum_to_string(outFormat),
 	  map[0],
 	  map[1],
 	  map[2],
@@ -1278,9 +1278,53 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format)
    }
 }
 
+/**
+ * Convert integer unpack formats (e.g. GL_RGBA_INTEGER) to their base format.
+ */
+GLenum
+_mesa_unpack_format_to_base_format(GLenum format)
+{
+   switch(format) {
+   case GL_RED_INTEGER:
+      return GL_RED;
+   case GL_GREEN_INTEGER:
+      return GL_GREEN;
+   case GL_BLUE_INTEGER:
+      return GL_BLUE;
+   case GL_ALPHA_INTEGER:
+      return GL_ALPHA;
+   case GL_RG_INTEGER:
+      return GL_RG;
+   case GL_RGB_INTEGER:
+      return GL_RGB;
+   case GL_RGBA_INTEGER:
+      return GL_RGBA;
+   case GL_BGR_INTEGER:
+      return GL_BGR;
+   case GL_BGRA_INTEGER:
+      return GL_BGRA;
+   case GL_LUMINANCE_INTEGER_EXT:
+      return GL_LUMINANCE;
+   case GL_LUMINANCE_ALPHA_INTEGER_EXT:
+      return GL_LUMINANCE_ALPHA;
+   case GL_RED: /* the labels below are listed explicitly but share the default */
+   case GL_GREEN:
+   case GL_BLUE:
+   case GL_RG:
+   case GL_RGB:
+   case GL_RGBA:
+   case GL_BGR:
+   case GL_BGRA:
+   case GL_ALPHA:
+   case GL_LUMINANCE:
+   case GL_LUMINANCE_ALPHA:
+   default:
+      return format; /* every other value is returned unchanged */
+   }
+}
 
 /**
- * Convert various base formats to the cooresponding integer format.
+ * Convert various base formats to the corresponding integer format.
  */
 GLenum
 _mesa_base_format_to_integer_format(GLenum format)
@@ -2605,8 +2649,6 @@ get_swizzle_from_gl_format(GLenum format, uint8_t *swizzle)
 uint32_t
 _mesa_format_from_format_and_type(GLenum format, GLenum type)
 {
-   mesa_array_format array_format;
-
    bool is_array_format = true;
    uint8_t swizzle[4];
    bool normalized = false, is_float = false, is_signed = false;
@@ -2662,15 +2704,9 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type)
       normalized = !_mesa_is_enum_format_integer(format);
       num_channels = _mesa_components_in_format(format);
 
-      array_format =
-         MESA_ARRAY_FORMAT(type_size, is_signed, is_float,
-                           normalized, num_channels,
-                           swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
-
-      if (!_mesa_little_endian())
-         array_format = _mesa_array_format_flip_channels(array_format);
-
-      return array_format;
+      return MESA_ARRAY_FORMAT(type_size, is_signed, is_float,
+                               normalized, num_channels,
+                               swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
    }
 
    /* Otherwise this is not an array format, so return the mesa_format
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index 8881cb7d86b..419955a6033 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -101,6 +101,9 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format);
 extern GLenum
 _mesa_base_format_to_integer_format(GLenum format);
 
+extern GLenum
+_mesa_unpack_format_to_base_format(GLenum format);
+
 extern GLboolean
 _mesa_base_format_has_channel(GLenum base_format, GLenum pname);
 
diff --git a/src/mesa/main/hint.c b/src/mesa/main/hint.c
index 3e056ebaf13..984239a7276 100644
--- a/src/mesa/main/hint.c
+++ b/src/mesa/main/hint.c
@@ -40,8 +40,8 @@ _mesa_Hint( GLenum target, GLenum mode )
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glHint %s %s\n",
-                  _mesa_lookup_enum_by_nr(target),
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(target),
+                  _mesa_enum_to_string(mode));
 
    if (mode != GL_NICEST && mode != GL_FASTEST && mode != GL_DONT_CARE) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glHint(mode)");
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index 68c7316575c..350e6752c8b 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -369,7 +369,7 @@ _mesa_float_to_half(float val)
           * or normal.
           */
          e = 0;
-         m = (int) _mesa_roundevenf((1 << 24) * fabsf(fi.f));
+         m = _mesa_lroundevenf((1 << 24) * fabsf(fi.f));
       }
       else if (new_exp > 15) {
          /* map this value to infinity */
@@ -383,7 +383,7 @@ _mesa_float_to_half(float val)
           * either normal or infinite.
           */
          e = new_exp + 15;
-         m = (int) _mesa_roundevenf(flt_m / (float) (1 << 13));
+         m = _mesa_lroundevenf(flt_m / (float) (1 << 13));
       }
    }
 
diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h
index 9ffe3decd0f..d61279ac4e5 100644
--- a/src/mesa/main/imports.h
+++ b/src/mesa/main/imports.h
@@ -170,34 +170,6 @@ static inline int IROUND_POS(float f)
    return (int) (f + 0.5F);
 }
 
-#ifdef __x86_64__
-#  include <xmmintrin.h>
-#endif
-
-/**
- * Convert float to int using a fast method.  The rounding mode may vary.
- */
-static inline int F_TO_I(float f)
-{
-#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)
-   int r;
-   __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st");
-   return r;
-#elif defined(USE_X86_ASM) && defined(_MSC_VER)
-   int r;
-   _asm {
-	 fld f
-	 fistp r
-	}
-   return r;
-#elif defined(__x86_64__)
-   return _mm_cvt_ss2si(_mm_load_ss(&f));
-#else
-   return IROUND(f);
-#endif
-}
-
-
 /** Return (as an integer) floor of float */
 static inline int IFLOOR(float f)
 {
diff --git a/src/mesa/main/light.c b/src/mesa/main/light.c
index 4021dbef922..14b4b04162b 100644
--- a/src/mesa/main/light.c
+++ b/src/mesa/main/light.c
@@ -42,16 +42,16 @@ _mesa_ShadeModel( GLenum mode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glShadeModel %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glShadeModel %s\n", _mesa_enum_to_string(mode));
+
+   if (ctx->Light.ShadeModel == mode)
+      return;
 
    if (mode != GL_FLAT && mode != GL_SMOOTH) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glShadeModel");
       return;
    }
 
-   if (ctx->Light.ShadeModel == mode)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_LIGHT);
    ctx->Light.ShadeModel = mode;
 
@@ -143,7 +143,7 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa
       COPY_3V(light->SpotDirection, params);
       break;
    case GL_SPOT_EXPONENT:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       assert(params[0] <= ctx->Const.MaxSpotExponent);
       if (light->SpotExponent == params[0])
 	 return;
@@ -151,12 +151,12 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa
       light->SpotExponent = params[0];
       break;
    case GL_SPOT_CUTOFF:
-      assert(params[0] == 180.0 || (params[0] >= 0.0 && params[0] <= 90.0));
+      assert(params[0] == 180.0F || (params[0] >= 0.0F && params[0] <= 90.0F));
       if (light->SpotCutoff == params[0])
          return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       light->SpotCutoff = params[0];
-      light->_CosCutoff = (GLfloat) (cos(light->SpotCutoff * M_PI / 180.0));
+      light->_CosCutoff = (cosf(light->SpotCutoff * M_PI / 180.0));
       if (light->_CosCutoff < 0)
          light->_CosCutoff = 0;
       if (light->SpotCutoff != 180.0F)
@@ -165,21 +165,21 @@ _mesa_light(struct gl_context *ctx, GLuint lnum, GLenum pname, const GLfloat *pa
          light->_Flags &= ~LIGHT_SPOT;
       break;
    case GL_CONSTANT_ATTENUATION:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       if (light->ConstantAttenuation == params[0])
 	 return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       light->ConstantAttenuation = params[0];
       break;
    case GL_LINEAR_ATTENUATION:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       if (light->LinearAttenuation == params[0])
 	 return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       light->LinearAttenuation = params[0];
       break;
    case GL_QUADRATIC_ATTENUATION:
-      assert(params[0] >= 0.0);
+      assert(params[0] >= 0.0F);
       if (light->QuadraticAttenuation == params[0])
 	 return;
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
@@ -238,31 +238,31 @@ _mesa_Lightfv( GLenum light, GLenum pname, const GLfloat *params )
       params = temp;
       break;
    case GL_SPOT_EXPONENT:
-      if (params[0] < 0.0 || params[0] > ctx->Const.MaxSpotExponent) {
+      if (params[0] < 0.0F || params[0] > ctx->Const.MaxSpotExponent) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_SPOT_CUTOFF:
-      if ((params[0] < 0.0 || params[0] > 90.0) && params[0] != 180.0) {
+      if ((params[0] < 0.0F || params[0] > 90.0F) && params[0] != 180.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_CONSTANT_ATTENUATION:
-      if (params[0] < 0.0) {
+      if (params[0] < 0.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_LINEAR_ATTENUATION:
-      if (params[0] < 0.0) {
+      if (params[0] < 0.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
       break;
    case GL_QUADRATIC_ATTENUATION:
-      if (params[0] < 0.0) {
+      if (params[0] < 0.0F) {
 	 _mesa_error(ctx, GL_INVALID_VALUE, "glLight");
 	 return;
       }
@@ -463,14 +463,14 @@ _mesa_LightModelfv( GLenum pname, const GLfloat *params )
       case GL_LIGHT_MODEL_LOCAL_VIEWER:
          if (ctx->API != API_OPENGL_COMPAT)
             goto invalid_pname;
-         newbool = (params[0]!=0.0);
+         newbool = (params[0] != 0.0F);
 	 if (ctx->Light.Model.LocalViewer == newbool)
 	    return;
 	 FLUSH_VERTICES(ctx, _NEW_LIGHT);
 	 ctx->Light.Model.LocalViewer = newbool;
          break;
       case GL_LIGHT_MODEL_TWO_SIDE:
-         newbool = (params[0]!=0.0);
+         newbool = (params[0] != 0.0F);
 	 if (ctx->Light.Model.TwoSide == newbool)
 	    return;
 	 FLUSH_VERTICES(ctx, _NEW_LIGHT);
@@ -723,8 +723,8 @@ _mesa_ColorMaterial( GLenum face, GLenum mode )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glColorMaterial %s %s\n",
-                  _mesa_lookup_enum_by_nr(face),
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(face),
+                  _mesa_enum_to_string(mode));
 
    bitmask = _mesa_material_bitmask(ctx, face, mode, legal, "glColorMaterial");
    if (bitmask == 0)
@@ -975,7 +975,7 @@ compute_light_positions( struct gl_context *ctx )
       }
       else {
          /* positional light w/ homogeneous coordinate, divide by W */
-         GLfloat wInv = (GLfloat)1.0 / light->_Position[3];
+         GLfloat wInv = 1.0F / light->_Position[3];
          light->_Position[0] *= wInv;
          light->_Position[1] *= wInv;
          light->_Position[2] *= wInv;
@@ -1024,7 +1024,7 @@ update_modelview_scale( struct gl_context *ctx )
    if (!_math_matrix_is_length_preserving(ctx->ModelviewMatrixStack.Top)) {
       const GLfloat *m = ctx->ModelviewMatrixStack.Top->inv;
       GLfloat f = m[2] * m[2] + m[6] * m[6] + m[10] * m[10];
-      if (f < 1e-12) f = 1.0;
+      if (f < 1e-12f) f = 1.0f;
       if (ctx->_NeedEyeCoords)
 	 ctx->_ModelViewInvScale = 1.0f / sqrtf(f);
       else
diff --git a/src/mesa/main/lines.c b/src/mesa/main/lines.c
index 3c08ed2e713..c020fb3eb9e 100644
--- a/src/mesa/main/lines.c
+++ b/src/mesa/main/lines.c
@@ -45,7 +45,7 @@ _mesa_LineWidth( GLfloat width )
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glLineWidth %f\n", width);
 
-   if (width<=0.0) {
+   if (width <= 0.0F) {
       _mesa_error( ctx, GL_INVALID_VALUE, "glLineWidth" );
       return;
    }
@@ -63,7 +63,7 @@ _mesa_LineWidth( GLfloat width )
    if (ctx->API == API_OPENGL_CORE
        && ((ctx->Const.ContextFlags & GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT)
            != 0)
-       && width > 1.0) {
+       && width > 1.0F) {
       _mesa_error( ctx, GL_INVALID_VALUE, "glLineWidth" );
       return;
    }
diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h
index 0608650aeb4..54df50c9cfe 100644
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -33,6 +33,7 @@
 
 #include "util/macros.h"
 #include "util/u_math.h"
+#include "util/rounding.h"
 #include "imports.h"
 
 
@@ -131,12 +132,12 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256];
 #define INT_TO_USHORT(i)   ((i) < 0 ? 0 : ((GLushort) ((i) >> 15)))
 #define UINT_TO_USHORT(i)  ((i) < 0 ? 0 : ((GLushort) ((i) >> 16)))
 #define UNCLAMPED_FLOAT_TO_USHORT(us, f)  \
-        us = ( (GLushort) F_TO_I( CLAMP((f), 0.0F, 1.0F) * 65535.0F) )
+        us = ( (GLushort) _mesa_lroundevenf( CLAMP((f), 0.0F, 1.0F) * 65535.0F) )
 #define CLAMPED_FLOAT_TO_USHORT(us, f)  \
-        us = ( (GLushort) F_TO_I( (f) * 65535.0F) )
+        us = ( (GLushort) _mesa_lroundevenf( (f) * 65535.0F) )
 
 #define UNCLAMPED_FLOAT_TO_SHORT(s, f)  \
-        s = ( (GLshort) F_TO_I( CLAMP((f), -1.0F, 1.0F) * 32767.0F) )
+        s = ( (GLshort) _mesa_lroundevenf( CLAMP((f), -1.0F, 1.0F) * 32767.0F) )
 
 /***
  *** UNCLAMPED_FLOAT_TO_UBYTE: clamp float to [0,1] and map to ubyte in [0,255]
@@ -167,9 +168,9 @@ extern GLfloat _mesa_ubyte_to_float_color_tab[256];
         } while (0)
 #else
 #define UNCLAMPED_FLOAT_TO_UBYTE(ub, f) \
-	ub = ((GLubyte) F_TO_I(CLAMP((f), 0.0F, 1.0F) * 255.0F))
+	ub = ((GLubyte) _mesa_lroundevenf(CLAMP((f), 0.0F, 1.0F) * 255.0F))
 #define CLAMPED_FLOAT_TO_UBYTE(ub, f) \
-	ub = ((GLubyte) F_TO_I((f) * 255.0F))
+	ub = ((GLubyte) _mesa_lroundevenf((f) * 255.0F))
 #endif
 
 static fi_type UINT_AS_UNION(GLuint u)
@@ -678,17 +679,6 @@ minify(unsigned value, unsigned levels)
     return MAX2(1, value >> levels);
 }
 
-/**
- * Return true if the given value is a power of two.
- *
- * Note that this considers 0 a power of two.
- */
-static inline bool
-is_power_of_two(unsigned value)
-{
-   return (value & (value - 1)) == 0;
-}
-
 /**
  * Align a value up to an alignment value
  *
diff --git a/src/mesa/main/matrix.c b/src/mesa/main/matrix.c
index 80c8a248ce4..2b8016a4a72 100644
--- a/src/mesa/main/matrix.c
+++ b/src/mesa/main/matrix.c
@@ -229,7 +229,7 @@ _mesa_PushMatrix( void )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glPushMatrix %s\n",
-                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                  _mesa_enum_to_string(ctx->Transform.MatrixMode));
 
    if (stack->Depth + 1 >= stack->MaxDepth) {
       if (ctx->Transform.MatrixMode == GL_TEXTURE) {
@@ -239,7 +239,7 @@ _mesa_PushMatrix( void )
       }
       else {
          _mesa_error(ctx,  GL_STACK_OVERFLOW, "glPushMatrix(mode=%s)",
-                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                     _mesa_enum_to_string(ctx->Transform.MatrixMode));
       }
       return;
    }
@@ -270,7 +270,7 @@ _mesa_PopMatrix( void )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glPopMatrix %s\n",
-                  _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                  _mesa_enum_to_string(ctx->Transform.MatrixMode));
 
    if (stack->Depth == 0) {
       if (ctx->Transform.MatrixMode == GL_TEXTURE) {
@@ -280,7 +280,7 @@ _mesa_PopMatrix( void )
       }
       else {
          _mesa_error(ctx,  GL_STACK_UNDERFLOW, "glPopMatrix(mode=%s)",
-                     _mesa_lookup_enum_by_nr(ctx->Transform.MatrixMode));
+                     _mesa_enum_to_string(ctx->Transform.MatrixMode));
       }
       return;
    }
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 7732d09b2ec..1e22f930092 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -2077,9 +2077,12 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
 
       /* Get the uncompressed image */
       assert(srcImage->Level == texObj->BaseLevel);
-      ctx->Driver.GetTexImage(ctx,
-                              temp_base_format, temp_datatype,
-                              temp_src, srcImage);
+      ctx->Driver.GetTexSubImage(ctx,
+                                 0, 0, 0,
+                                 srcImage->Width, srcImage->Height,
+                                 srcImage->Depth,
+                                 temp_base_format, temp_datatype,
+                                 temp_src, srcImage);
       /* restore packing mode */
       ctx->Pack = save;
    }
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 2d285b87a78..83f3717754d 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -90,7 +90,7 @@ struct vbo_context;
 
 
 /** Extra draw modes beyond GL_POINTS, GL_TRIANGLE_FAN, etc */
-#define PRIM_MAX                 GL_TRIANGLE_STRIP_ADJACENCY
+#define PRIM_MAX                 GL_PATCHES
 #define PRIM_OUTSIDE_BEGIN_END   (PRIM_MAX + 1)
 #define PRIM_UNKNOWN             (PRIM_MAX + 2)
 
@@ -109,6 +109,8 @@ _mesa_varying_slot_in_fs(gl_varying_slot slot)
    case VARYING_SLOT_EDGE:
    case VARYING_SLOT_CLIP_VERTEX:
    case VARYING_SLOT_LAYER:
+   case VARYING_SLOT_TESS_LEVEL_OUTER:
+   case VARYING_SLOT_TESS_LEVEL_INNER:
       return GL_FALSE;
    default:
       return GL_TRUE;
@@ -1254,6 +1256,7 @@ typedef enum {
    USAGE_UNIFORM_BUFFER = 0x1,
    USAGE_TEXTURE_BUFFER = 0x2,
    USAGE_ATOMIC_COUNTER_BUFFER = 0x4,
+   USAGE_SHADER_STORAGE_BUFFER = 0x8,
 } gl_buffer_usage;
 
 
@@ -1654,6 +1657,11 @@ struct gl_transform_feedback_info
     * multiple transform feedback outputs in the same buffer.
     */
    unsigned BufferStride[MAX_FEEDBACK_BUFFERS];
+
+   /**
+    * Which transform feedback stream this buffer binding is associated with.
+    */
+   unsigned BufferStream[MAX_FEEDBACK_BUFFERS];
 };
 
 
@@ -1891,6 +1899,8 @@ struct gl_program
    GLbitfield64 InputsRead;     /**< Bitmask of which input regs are read */
    GLbitfield64 DoubleInputsRead;     /**< Bitmask of which input regs are read  and are doubles */
    GLbitfield64 OutputsWritten; /**< Bitmask of which output regs are written */
+   GLbitfield PatchInputsRead;  /**< VAR[0..31] usage for patch inputs (user-defined only) */
+   GLbitfield PatchOutputsWritten; /**< VAR[0..31] usage for patch outputs (user-defined only) */
    GLbitfield SystemValuesRead;   /**< Bitmask of SYSTEM_VALUE_x inputs used */
    GLbitfield TexturesUsed[MAX_COMBINED_TEXTURE_IMAGE_UNITS];  /**< TEXTURE_x_BIT bitmask */
    GLbitfield SamplersUsed;   /**< Bitfield of which samplers are used */
@@ -1958,6 +1968,29 @@ struct gl_vertex_program
 };
 
 
+/** Tessellation control program object */
+struct gl_tess_ctrl_program
+{
+   struct gl_program Base;   /**< base class */
+
+   /* output layout */
+   GLint VerticesOut;
+};
+
+
+/** Tessellation evaluation program object */
+struct gl_tess_eval_program
+{
+   struct gl_program Base;   /**< base class */
+
+   /* input layout */
+   GLenum PrimitiveMode; /* GL_TRIANGLES, GL_QUADS or GL_ISOLINES */
+   GLenum Spacing;       /* GL_EQUAL, GL_FRACTIONAL_EVEN, GL_FRACTIONAL_ODD */
+   GLenum VertexOrder;   /* GL_CW or GL_CCW */
+   bool PointMode;
+};
+
+
 /** Geometry program object */
 struct gl_geometry_program
 {
@@ -2060,6 +2093,27 @@ struct gl_vertex_program_state
    GLboolean _Overriden;
 };
 
+/**
+ * Context state for tessellation control programs.
+ */
+struct gl_tess_ctrl_program_state
+{
+   /** Currently bound and valid shader. */
+   struct gl_tess_ctrl_program *_Current;
+
+   GLint patch_vertices;
+   GLfloat patch_default_outer_level[4];
+   GLfloat patch_default_inner_level[2];
+};
+
+/**
+ * Context state for tessellation evaluation programs.
+ */
+struct gl_tess_eval_program_state
+{
+   /** Currently bound and valid shader. */
+   struct gl_tess_eval_program *_Current;
+};
 
 /**
  * Context state for geometry programs.
@@ -2154,13 +2208,23 @@ struct gl_ati_fragment_shader_state
    struct ati_fragment_shader *Current;
 };
 
+/**
+ *  Shader subroutine function definition
+ */
+struct gl_subroutine_function
+{
+   char *name;
+   int num_compat_types;
+   const struct glsl_type **types;
+};
 
 /**
  * A GLSL vertex or fragment shader object.
  */
 struct gl_shader
 {
-   /** GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB.
+   /** GL_FRAGMENT_SHADER || GL_VERTEX_SHADER || GL_GEOMETRY_SHADER_ARB ||
+    *  GL_TESS_CONTROL_SHADER || GL_TESS_EVALUATION_SHADER.
     * Must be the first field.
     */
    GLenum Type;
@@ -2239,6 +2303,41 @@ struct gl_shader
    bool origin_upper_left;
    bool pixel_center_integer;
 
+   /**
+    * Tessellation Control shader state from layout qualifiers.
+    */
+   struct {
+      /**
+       * 0 - vertices not declared in shader, or
+       * 1 .. GL_MAX_PATCH_VERTICES
+       */
+      GLint VerticesOut;
+   } TessCtrl;
+
+   /**
+    * Tessellation Evaluation shader state from layout qualifiers.
+    */
+   struct {
+      /**
+       * GL_TRIANGLES, GL_QUADS, GL_ISOLINES or PRIM_UNKNOWN if it's not set
+       * in this shader.
+       */
+      GLenum PrimitiveMode;
+      /**
+       * GL_EQUAL, GL_FRACTIONAL_ODD, GL_FRACTIONAL_EVEN, or 0 if it's not set
+       * in this shader.
+       */
+      GLenum Spacing;
+      /**
+       * GL_CW, GL_CCW, or 0 if it's not set in this shader.
+       */
+      GLenum VertexOrder;
+      /**
+       * 1, 0, or -1 if it's not set in this shader.
+       */
+      int PointMode;
+   } TessEval;
+
    /**
     * Geometry shader state from GLSL 1.50 layout qualifiers.
     */
@@ -2304,6 +2403,25 @@ struct gl_shader
        */
       unsigned LocalSize[3];
    } Comp;
+
+   /**
+    * Number of types for subroutine uniforms.
+    */
+   GLuint NumSubroutineUniformTypes;
+
+   /**
+    * Subroutine uniform remap table
+    * based on the program level uniform remap table.
+    */
+   GLuint NumSubroutineUniformRemapTable;
+   struct gl_uniform_storage **SubroutineUniformRemapTable;
+
+   /**
+    * Number of subroutine functions for this stage
+    * and storage for them.
+    */
+   GLuint NumSubroutineFunctions;
+   struct gl_subroutine_function *SubroutineFunctions;
 };
 
 
@@ -2364,6 +2482,11 @@ struct gl_uniform_block
     */
    GLuint UniformBufferSize;
 
+   /**
+    * Is this actually an interface block for a shader storage buffer?
+    */
+   bool IsShaderStorage;
+
    /**
     * Layout specified in the shader
     *
@@ -2467,6 +2590,37 @@ struct gl_shader_program
    /** Post-link gl_FragDepth layout for ARB_conservative_depth. */
    enum gl_frag_depth_layout FragDepthLayout;
 
+   /**
+    * Tessellation Control shader state from layout qualifiers.
+    */
+   struct {
+      /**
+       * 0 - vertices not declared in shader, or
+       * 1 .. GL_MAX_PATCH_VERTICES
+       */
+      GLint VerticesOut;
+   } TessCtrl;
+
+   /**
+    * Tessellation Evaluation shader state from layout qualifiers.
+    */
+   struct {
+      /** GL_TRIANGLES, GL_QUADS or GL_ISOLINES */
+      GLenum PrimitiveMode;
+      /** GL_EQUAL, GL_FRACTIONAL_ODD or GL_FRACTIONAL_EVEN */
+      GLenum Spacing;
+      /** GL_CW or GL_CCW */
+      GLenum VertexOrder;
+      bool PointMode;
+      /**
+       * True if gl_ClipDistance is written to.  Copied into
+       * gl_tess_eval_program by _mesa_copy_linked_program_data().
+       */
+      GLboolean UsesClipDistance;
+      GLuint ClipDistanceArraySize; /**< Size of the gl_ClipDistance array, or
+                                         0 if not present. */
+   } TessEval;
+
    /**
     * Geometry shader state - copied into gl_geometry_program by
     * _mesa_copy_linked_program_data().
@@ -2681,6 +2835,7 @@ struct gl_shader_compiler_options
    GLboolean EmitNoIndirectOutput;  /**< No indirect addressing of outputs */
    GLboolean EmitNoIndirectTemp;    /**< No indirect addressing of temps */
    GLboolean EmitNoIndirectUniform; /**< No indirect addressing of constants */
+   GLboolean EmitNoIndirectSampler; /**< No indirect addressing of samplers */
    /*@}*/
 
    GLuint MaxIfDepth;               /**< Maximum nested IF blocks */
@@ -3100,6 +3255,9 @@ struct gl_program_constants
 
    /* GL_ARB_shader_image_load_store */
    GLuint MaxImageUniforms;
+
+   /* GL_ARB_shader_storage_buffer_object */
+   GLuint MaxShaderStorageBlocks;
 };
 
 
@@ -3197,6 +3355,15 @@ struct gl_constants
    GLuint UniformBufferOffsetAlignment;
    /** @} */
 
+   /** @{
+    * GL_ARB_shader_storage_buffer_object
+    */
+   GLuint MaxCombinedShaderStorageBlocks;
+   GLuint MaxShaderStorageBufferBindings;
+   GLuint MaxShaderStorageBlockSize;
+   GLuint ShaderStorageBufferOffsetAlignment;
+   /** @} */
+
    /**
     * GL_ARB_explicit_uniform_location
     */
@@ -3423,6 +3590,13 @@ struct gl_constants
    GLenum ContextReleaseBehavior;
 
    struct gl_shader_compiler_options ShaderCompilerOptions[MESA_SHADER_STAGES];
+
+   /** GL_ARB_tessellation_shader */
+   GLuint MaxPatchVertices;
+   GLuint MaxTessGenLevel;
+   GLuint MaxTessPatchComponents;
+   GLuint MaxTessControlTotalOutputComponents;
+   bool LowerTessLevel; /**< Lower gl_TessLevel* from float[n] to vecn? */
 };
 
 
@@ -3484,6 +3658,8 @@ struct gl_extensions
    GLboolean ARB_shader_image_load_store;
    GLboolean ARB_shader_precision;
    GLboolean ARB_shader_stencil_export;
+   GLboolean ARB_shader_storage_buffer_object;
+   GLboolean ARB_shader_subroutine;
    GLboolean ARB_shader_texture_lod;
    GLboolean ARB_shading_language_packing;
    GLboolean ARB_shading_language_420pack;
@@ -3815,6 +3991,12 @@ struct gl_driver_flags
     */
    uint64_t NewUniformBuffer;
 
+   /**
+    * gl_context::ShaderStorageBufferBindings
+    * gl_shader_program::ShaderStorageBlocks
+    */
+   uint64_t NewShaderStorageBuffer;
+
    uint64_t NewTextureBuffer;
 
    /**
@@ -3826,6 +4008,11 @@ struct gl_driver_flags
     * gl_context::ImageUnits
     */
    uint64_t NewImageUnits;
+
+   /**
+    * gl_context::TessCtrlProgram::patch_default_*
+    */
+   uint64_t NewDefaultTessLevels;
 };
 
 struct gl_uniform_buffer_binding
@@ -3842,6 +4029,20 @@ struct gl_uniform_buffer_binding
    GLboolean AutomaticSize;
 };
 
+struct gl_shader_storage_buffer_binding
+{
+   struct gl_buffer_object *BufferObject;
+   /** Start of shader storage block data in the buffer */
+   GLintptr Offset;
+   /** Size of data allowed to be referenced from the buffer (in bytes) */
+   GLsizeiptr Size;
+   /**
+    * glBindBufferBase() indicates that the Size should be ignored and only
+    * limited by the current size of the BufferObject.
+    */
+   GLboolean AutomaticSize;
+};
+
 /**
  * ARB_shader_image_load_store image unit.
  */
@@ -4047,6 +4248,8 @@ struct gl_context
    struct gl_fragment_program_state FragmentProgram;
    struct gl_geometry_program_state GeometryProgram;
    struct gl_compute_program_state ComputeProgram;
+   struct gl_tess_ctrl_program_state TessCtrlProgram;
+   struct gl_tess_eval_program_state TessEvalProgram;
    struct gl_ati_fragment_shader_state ATIFragmentShader;
 
    struct gl_pipeline_shader_state Pipeline; /**< GLSL pipeline shader object state */
@@ -4088,6 +4291,12 @@ struct gl_context
     */
    struct gl_buffer_object *UniformBuffer;
 
+   /**
+    * Current GL_ARB_shader_storage_buffer_object binding referenced by
+    * GL_SHADER_STORAGE_BUFFER target for glBufferData, glMapBuffer, etc.
+    */
+   struct gl_buffer_object *ShaderStorageBuffer;
+
    /**
     * Array of uniform buffers for GL_ARB_uniform_buffer_object and GL 3.1.
     * This is set up using glBindBufferRange() or glBindBufferBase().  They are
@@ -4097,6 +4306,15 @@ struct gl_context
    struct gl_uniform_buffer_binding
       UniformBufferBindings[MAX_COMBINED_UNIFORM_BUFFERS];
 
+   /**
+    * Array of shader storage buffers for ARB_shader_storage_buffer_object
+    * and GL 4.3. This is set up using glBindBufferRange() or
+    * glBindBufferBase().  They are associated with shader storage blocks by
+    * glShaderStorageBlockBinding()'s state in the shader program.
+    */
+   struct gl_shader_storage_buffer_binding
+      ShaderStorageBufferBindings[MAX_COMBINED_SHADER_STORAGE_BUFFERS];
+
    /**
     * Object currently associated with the GL_ATOMIC_COUNTER_BUFFER
     * target.
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 816837b95bd..09e6154f7ec 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -43,7 +43,7 @@ _mesa_SampleCoverage(GLclampf value, GLboolean invert)
 
    FLUSH_VERTICES(ctx, 0);
 
-   ctx->Multisample.SampleCoverageValue = (GLfloat) CLAMP(value, 0.0, 1.0);
+   ctx->Multisample.SampleCoverageValue = CLAMP(value, 0.0f, 1.0f);
    ctx->Multisample.SampleCoverageInvert = invert;
    ctx->NewState |= _NEW_MULTISAMPLE;
 }
@@ -134,7 +134,7 @@ _mesa_MinSampleShading(GLclampf value)
 
    FLUSH_VERTICES(ctx, 0);
 
-   ctx->Multisample.MinSampleShadingValue = CLAMP(value, 0.0, 1.0);
+   ctx->Multisample.MinSampleShadingValue = CLAMP(value, 0.0f, 1.0f);
    ctx->NewState |= _NEW_MULTISAMPLE;
 }
 
@@ -164,8 +164,11 @@ _mesa_check_sample_count(struct gl_context *ctx, GLenum target,
     *
     *     "If internalformat is a signed or unsigned integer format and samples
     *     is greater than zero, then the error INVALID_OPERATION is generated."
+    *
+    * This restriction is relaxed for OpenGL ES 3.1.
     */
-   if (_mesa_is_gles3(ctx) && _mesa_is_enum_format_integer(internalFormat)
+   if ((ctx->API == API_OPENGLES2 && ctx->Version == 30) &&
+       _mesa_is_enum_format_integer(internalFormat)
        && samples > 0) {
       return GL_INVALID_OPERATION;
    }
diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c
index 5626054687b..1019f893ba8 100644
--- a/src/mesa/main/objectlabel.c
+++ b/src/mesa/main/objectlabel.c
@@ -234,7 +234,7 @@ get_label_pointer(struct gl_context *ctx, GLenum identifier, GLuint name,
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(identifier = %s)",
-               caller, _mesa_lookup_enum_by_nr(identifier));
+               caller, _mesa_enum_to_string(identifier));
    return NULL;
 }
 
diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
index f72360817e9..7147fd6e4fe 100644
--- a/src/mesa/main/pack.c
+++ b/src/mesa/main/pack.c
@@ -470,7 +470,7 @@ extract_uint_indexes(GLuint n, GLuint indexes[],
 static inline GLuint
 clamp_float_to_uint(GLfloat f)
 {
-   return f < 0.0F ? 0 : F_TO_I(f);
+   return f < 0.0F ? 0 : _mesa_lroundevenf(f);
 }
 
 
@@ -478,7 +478,7 @@ static inline GLuint
 clamp_half_to_uint(GLhalfARB h)
 {
    GLfloat f = _mesa_half_to_float(h);
-   return f < 0.0F ? 0 : F_TO_I(f);
+   return f < 0.0F ? 0 : _mesa_lroundevenf(f);
 }
 
 
@@ -796,7 +796,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
     * back to an int type can introduce errors that will show up as
     * artifacts in things like depth peeling which uses glCopyTexImage.
     */
-   if (ctx->Pixel.DepthScale == 1.0 && ctx->Pixel.DepthBias == 0.0) {
+   if (ctx->Pixel.DepthScale == 1.0F && ctx->Pixel.DepthBias == 0.0F) {
       if (srcType == GL_UNSIGNED_INT && dstType == GL_UNSIGNED_SHORT) {
          const GLuint *src = (const GLuint *) source;
          GLushort *dst = (GLushort *) dest;
@@ -874,8 +874,8 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
       case GL_UNSIGNED_INT_24_8_EXT: /* GL_EXT_packed_depth_stencil */
          if (dstType == GL_UNSIGNED_INT_24_8_EXT &&
              depthMax == 0xffffff &&
-             ctx->Pixel.DepthScale == 1.0 &&
-             ctx->Pixel.DepthBias == 0.0) {
+             ctx->Pixel.DepthScale == 1.0F &&
+             ctx->Pixel.DepthBias == 0.0F) {
             const GLuint *src = (const GLuint *) source;
             GLuint *zValues = (GLuint *) dest;
             GLuint i;
@@ -945,7 +945,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
    {
       const GLfloat scale = ctx->Pixel.DepthScale;
       const GLfloat bias = ctx->Pixel.DepthBias;
-      if (scale != 1.0 || bias != 0.0) {
+      if (scale != 1.0F || bias != 0.0F) {
          GLuint i;
          for (i = 0; i < n; i++) {
             depthValues[i] = depthValues[i] * scale + bias;
@@ -958,7 +958,7 @@ _mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
    if (needClamp) {
       GLuint i;
       for (i = 0; i < n; i++) {
-         depthValues[i] = (GLfloat)CLAMP(depthValues[i], 0.0, 1.0);
+         depthValues[i] = CLAMP(depthValues[i], 0.0F, 1.0F);
       }
    }
 
@@ -1025,7 +1025,7 @@ _mesa_pack_depth_span( struct gl_context *ctx, GLuint n, GLvoid *dest,
       return;
    }
 
-   if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) {
+   if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F) {
       memcpy(depthCopy, depthSpan, n * sizeof(GLfloat));
       _mesa_scale_and_bias_depth(ctx, n, depthCopy);
       depthSpan = depthCopy;
@@ -1153,7 +1153,7 @@ _mesa_pack_depth_stencil_span(struct gl_context *ctx,GLuint n,
       return;
    }
 
-   if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) {
+   if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F) {
       memcpy(depthCopy, depthVals, n * sizeof(GLfloat));
       _mesa_scale_and_bias_depth(ctx, n, depthCopy);
       depthVals = depthCopy;
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 279ae2078fe..07acbf10c1d 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -244,14 +244,13 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
     *
     *     "If stages is not the special value ALL_SHADER_BITS, and has a bit
     *     set that is not recognized, the error INVALID_VALUE is generated."
-    *
-    * NOT YET SUPPORTED:
-    * GL_TESS_CONTROL_SHADER_BIT
-    * GL_TESS_EVALUATION_SHADER_BIT
     */
    any_valid_stages = GL_VERTEX_SHADER_BIT | GL_FRAGMENT_SHADER_BIT;
    if (_mesa_has_geometry_shaders(ctx))
       any_valid_stages |= GL_GEOMETRY_SHADER_BIT;
+   if (_mesa_has_tessellation(ctx))
+      any_valid_stages |= GL_TESS_CONTROL_SHADER_BIT |
+                          GL_TESS_EVALUATION_SHADER_BIT;
 
    if (stages != GL_ALL_SHADER_BITS && (stages & ~any_valid_stages) != 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glUseProgramStages(Stages)");
@@ -327,6 +326,12 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
 
    if ((stages & GL_GEOMETRY_SHADER_BIT) != 0)
       _mesa_use_shader_program(ctx, GL_GEOMETRY_SHADER, shProg, pipe);
+
+   if ((stages & GL_TESS_CONTROL_SHADER_BIT) != 0)
+      _mesa_use_shader_program(ctx, GL_TESS_CONTROL_SHADER, shProg, pipe);
+
+   if ((stages & GL_TESS_EVALUATION_SHADER_BIT) != 0)
+      _mesa_use_shader_program(ctx, GL_TESS_EVALUATION_SHADER, shProg, pipe);
 }
 
 /**
@@ -588,6 +593,7 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
    /* Are geometry shaders available in this context?
     */
    const bool has_gs = _mesa_has_geometry_shaders(ctx);
+   const bool has_tess = _mesa_has_tessellation(ctx);
 
    if (!pipe) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -615,11 +621,17 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
          ? pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name : 0;
       return;
    case GL_TESS_EVALUATION_SHADER:
-      /* NOT YET SUPPORTED */
-      break;
+      if (!has_tess)
+         break;
+      *params = pipe->CurrentProgram[MESA_SHADER_TESS_EVAL]
+         ? pipe->CurrentProgram[MESA_SHADER_TESS_EVAL]->Name : 0;
+      return;
    case GL_TESS_CONTROL_SHADER:
-      /* NOT YET SUPPORTED */
-      break;
+      if (!has_tess)
+         break;
+      *params = pipe->CurrentProgram[MESA_SHADER_TESS_CTRL]
+         ? pipe->CurrentProgram[MESA_SHADER_TESS_CTRL]->Name : 0;
+      return;
    case GL_GEOMETRY_SHADER:
       if (!has_gs)
          break;
@@ -635,7 +647,7 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
    }
 
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramPipelineiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 /**
@@ -777,7 +789,9 @@ _mesa_validate_program_pipeline(struct gl_context* ctx,
     *           executable vertex shader."
     */
    if (!pipe->CurrentProgram[MESA_SHADER_VERTEX]
-       && pipe->CurrentProgram[MESA_SHADER_GEOMETRY]) {
+       && (pipe->CurrentProgram[MESA_SHADER_GEOMETRY] ||
+           pipe->CurrentProgram[MESA_SHADER_TESS_CTRL] ||
+           pipe->CurrentProgram[MESA_SHADER_TESS_EVAL])) {
       pipe->InfoLog = ralloc_strdup(pipe, "Program lacks a vertex shader");
       goto err;
    }
diff --git a/src/mesa/main/pixel.c b/src/mesa/main/pixel.c
index ecda2694fc8..608a5454702 100644
--- a/src/mesa/main/pixel.c
+++ b/src/mesa/main/pixel.c
@@ -455,12 +455,12 @@ _mesa_GetnPixelMapusvARB( GLenum map, GLsizei bufSize, GLushort *values )
    /* special cases */
    case GL_PIXEL_MAP_I_TO_I:
       for (i = 0; i < mapsize; i++) {
-         values[i] = (GLushort) CLAMP(ctx->PixelMaps.ItoI.Map[i], 0.0, 65535.);
+         values[i] = (GLushort) CLAMP(ctx->PixelMaps.ItoI.Map[i], 0.0F, 65535.0F);
       }
       break;
    case GL_PIXEL_MAP_S_TO_S:
       for (i = 0; i < mapsize; i++) {
-         values[i] = (GLushort) CLAMP(ctx->PixelMaps.StoS.Map[i], 0.0, 65535.);
+         values[i] = (GLushort) CLAMP(ctx->PixelMaps.StoS.Map[i], 0.0F, 65535.0F);
       }
       break;
    default:
diff --git a/src/mesa/main/pixeltransfer.c b/src/mesa/main/pixeltransfer.c
index 94464ea6709..22eac00a7df 100644
--- a/src/mesa/main/pixeltransfer.c
+++ b/src/mesa/main/pixeltransfer.c
@@ -35,6 +35,7 @@
 #include "pixeltransfer.h"
 #include "imports.h"
 #include "mtypes.h"
+#include "util/rounding.h"
 
 
 /*
@@ -47,25 +48,25 @@ _mesa_scale_and_bias_rgba(GLuint n, GLfloat rgba[][4],
                           GLfloat rBias, GLfloat gBias,
                           GLfloat bBias, GLfloat aBias)
 {
-   if (rScale != 1.0 || rBias != 0.0) {
+   if (rScale != 1.0F || rBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][RCOMP] = rgba[i][RCOMP] * rScale + rBias;
       }
    }
-   if (gScale != 1.0 || gBias != 0.0) {
+   if (gScale != 1.0F || gBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][GCOMP] = rgba[i][GCOMP] * gScale + gBias;
       }
    }
-   if (bScale != 1.0 || bBias != 0.0) {
+   if (bScale != 1.0F || bBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][BCOMP] = rgba[i][BCOMP] * bScale + bBias;
       }
    }
-   if (aScale != 1.0 || aBias != 0.0) {
+   if (aScale != 1.0F || aBias != 0.0F) {
       GLuint i;
       for (i = 0; i < n; i++) {
          rgba[i][ACOMP] = rgba[i][ACOMP] * aScale + aBias;
@@ -94,10 +95,10 @@ _mesa_map_rgba( const struct gl_context *ctx, GLuint n, GLfloat rgba[][4] )
       GLfloat g = CLAMP(rgba[i][GCOMP], 0.0F, 1.0F);
       GLfloat b = CLAMP(rgba[i][BCOMP], 0.0F, 1.0F);
       GLfloat a = CLAMP(rgba[i][ACOMP], 0.0F, 1.0F);
-      rgba[i][RCOMP] = rMap[F_TO_I(r * rscale)];
-      rgba[i][GCOMP] = gMap[F_TO_I(g * gscale)];
-      rgba[i][BCOMP] = bMap[F_TO_I(b * bscale)];
-      rgba[i][ACOMP] = aMap[F_TO_I(a * ascale)];
+      rgba[i][RCOMP] = rMap[(int)_mesa_lroundevenf(r * rscale)];
+      rgba[i][GCOMP] = gMap[(int)_mesa_lroundevenf(g * gscale)];
+      rgba[i][BCOMP] = bMap[(int)_mesa_lroundevenf(b * bscale)];
+      rgba[i][ACOMP] = aMap[(int)_mesa_lroundevenf(a * ascale)];
    }
 }
 
@@ -236,7 +237,7 @@ _mesa_apply_ci_transfer_ops(const struct gl_context *ctx,
       GLuint i;
       for (i = 0; i < n; i++) {
          const GLuint j = indexes[i] & mask;
-         indexes[i] = F_TO_I(ctx->PixelMaps.ItoI.Map[j]);
+         indexes[i] = _mesa_lroundevenf(ctx->PixelMaps.ItoI.Map[j]);
       }
    }
 }
diff --git a/src/mesa/main/points.c b/src/mesa/main/points.c
index 5ad1f38f366..863e3c1af32 100644
--- a/src/mesa/main/points.c
+++ b/src/mesa/main/points.c
@@ -45,7 +45,7 @@ _mesa_PointSize( GLfloat size )
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (size <= 0.0) {
+   if (size <= 0.0F) {
       _mesa_error( ctx, GL_INVALID_VALUE, "glPointSize" );
       return;
    }
@@ -119,9 +119,9 @@ _mesa_PointParameterfv( GLenum pname, const GLfloat *params)
             return;
          FLUSH_VERTICES(ctx, _NEW_POINT);
          COPY_3V(ctx->Point.Params, params);
-         ctx->Point._Attenuated = (ctx->Point.Params[0] != 1.0 ||
-                                   ctx->Point.Params[1] != 0.0 ||
-                                   ctx->Point.Params[2] != 0.0);
+         ctx->Point._Attenuated = (ctx->Point.Params[0] != 1.0F ||
+                                   ctx->Point.Params[1] != 0.0F ||
+                                   ctx->Point.Params[2] != 0.0F);
          break;
       case GL_POINT_SIZE_MIN_EXT:
          if (params[0] < 0.0F) {
diff --git a/src/mesa/main/polygon.c b/src/mesa/main/polygon.c
index a1f0aa02da1..60af88f9857 100644
--- a/src/mesa/main/polygon.c
+++ b/src/mesa/main/polygon.c
@@ -56,7 +56,7 @@ _mesa_CullFace( GLenum mode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glCullFace %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glCullFace %s\n", _mesa_enum_to_string(mode));
 
    if (mode!=GL_FRONT && mode!=GL_BACK && mode!=GL_FRONT_AND_BACK) {
       _mesa_error( ctx, GL_INVALID_ENUM, "glCullFace" );
@@ -91,16 +91,16 @@ _mesa_FrontFace( GLenum mode )
    GET_CURRENT_CONTEXT(ctx);
 
    if (MESA_VERBOSE&VERBOSE_API)
-      _mesa_debug(ctx, "glFrontFace %s\n", _mesa_lookup_enum_by_nr(mode));
+      _mesa_debug(ctx, "glFrontFace %s\n", _mesa_enum_to_string(mode));
+
+   if (ctx->Polygon.FrontFace == mode)
+      return;
 
    if (mode!=GL_CW && mode!=GL_CCW) {
       _mesa_error( ctx, GL_INVALID_ENUM, "glFrontFace" );
       return;
    }
 
-   if (ctx->Polygon.FrontFace == mode)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_POLYGON);
    ctx->Polygon.FrontFace = mode;
 
@@ -128,8 +128,8 @@ _mesa_PolygonMode( GLenum face, GLenum mode )
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glPolygonMode %s %s\n",
-                  _mesa_lookup_enum_by_nr(face),
-                  _mesa_lookup_enum_by_nr(mode));
+                  _mesa_enum_to_string(face),
+                  _mesa_enum_to_string(mode));
 
    if (mode!=GL_POINT && mode!=GL_LINE && mode!=GL_FILL) {
       _mesa_error( ctx, GL_INVALID_ENUM, "glPolygonMode(mode)" );
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index d857b84e60d..23d2b4d2da0 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -28,10 +28,11 @@
 #include "main/mtypes.h"
 #include "main/shaderapi.h"
 #include "main/shaderobj.h"
+#include "main/context.h"
 #include "program_resource.h"
-
+#include "ir_uniform.h"
 static bool
-supported_interface_enum(GLenum iface)
+supported_interface_enum(struct gl_context *ctx, GLenum iface)
 {
    switch (iface) {
    case GL_UNIFORM:
@@ -42,17 +43,21 @@ supported_interface_enum(GLenum iface)
    case GL_ATOMIC_COUNTER_BUFFER:
       return true;
    case GL_VERTEX_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      return _mesa_has_shader_subroutine(ctx);
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      return _mesa_has_geometry_shaders(ctx) && _mesa_has_shader_subroutine(ctx);
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      return _mesa_has_compute_shaders(ctx) && _mesa_has_shader_subroutine(ctx);
    case GL_TESS_CONTROL_SUBROUTINE:
    case GL_TESS_EVALUATION_SUBROUTINE:
-   case GL_GEOMETRY_SUBROUTINE:
-   case GL_FRAGMENT_SUBROUTINE:
-   case GL_COMPUTE_SUBROUTINE:
-   case GL_VERTEX_SUBROUTINE_UNIFORM:
    case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
    case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
-   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
-   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      return _mesa_has_tessellation(ctx) && _mesa_has_shader_subroutine(ctx);
    case GL_BUFFER_VARIABLE:
    case GL_SHADER_STORAGE_BLOCK:
    default:
@@ -79,9 +84,9 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
    }
 
    /* Validate interface. */
-   if (!supported_interface_enum(programInterface)) {
+   if (!supported_interface_enum(ctx, programInterface)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
       return;
    }
 
@@ -96,8 +101,8 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
       if (programInterface == GL_ATOMIC_COUNTER_BUFFER) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glGetProgramInterfaceiv(%s pname %s)",
-                     _mesa_lookup_enum_by_nr(programInterface),
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(programInterface),
+                     _mesa_enum_to_string(pname));
          return;
       }
       /* Name length consists of base name, 3 additional chars '[0]' if
@@ -138,15 +143,40 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
       default:
         _mesa_error(ctx, GL_INVALID_OPERATION,
                     "glGetProgramInterfaceiv(%s pname %s)",
-                    _mesa_lookup_enum_by_nr(programInterface),
-                    _mesa_lookup_enum_by_nr(pname));
+                    _mesa_enum_to_string(programInterface),
+                    _mesa_enum_to_string(pname));
       };
       break;
    case GL_MAX_NUM_COMPATIBLE_SUBROUTINES:
+      switch (programInterface) {
+      case GL_VERTEX_SUBROUTINE_UNIFORM:
+      case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+      case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: {
+         for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
+            if (shProg->ProgramResourceList[i].Type == programInterface) {
+               struct gl_uniform_storage *uni =
+                  (struct gl_uniform_storage *)
+                  shProg->ProgramResourceList[i].Data;
+               *params = MAX2(*params, uni->num_compatible_subroutines);
+            }
+         }
+         break;
+      }
+
+      default:
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glGetProgramInterfaceiv(%s pname %s)",
+                     _mesa_enum_to_string(programInterface),
+                     _mesa_enum_to_string(pname));
+      }
+      break;
    default:
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glGetProgramInterfaceiv(pname %s)",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
    }
 }
 
@@ -173,32 +203,12 @@ is_xfb_marker(const char *str)
    return false;
 }
 
-/**
- * Checks if given name index is legal for GetProgramResourceIndex,
- * check is written to be compatible with GL_ARB_array_of_arrays.
- */
-static bool
-valid_program_resource_index_name(const GLchar *name)
-{
-   const char *array = strstr(name, "[");
-   const char *close = strrchr(name, ']');
-
-   /* Not array, no need for the check. */
-   if (!array)
-      return true;
-
-   /* Last array index has to be zero. */
-   if (!close || *--close != '0')
-      return false;
-
-   return true;
-}
-
 GLuint GLAPIENTRY
 _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
                               const GLchar *name)
 {
    GET_CURRENT_CONTEXT(ctx);
+   unsigned array_index = 0;
    struct gl_program_resource *res;
    struct gl_shader_program *shProg =
       _mesa_lookup_shader_program_err(ctx, program,
@@ -206,6 +216,11 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
    if (!shProg || !name)
       return GL_INVALID_INDEX;
 
+   if (!supported_interface_enum(ctx, programInterface)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)",
+                  _mesa_enum_to_string(programInterface));
+      return GL_INVALID_INDEX;
+   }
    /*
     * For the interface TRANSFORM_FEEDBACK_VARYING, the value INVALID_INDEX
     * should be returned when querying the index assigned to the special names
@@ -217,24 +232,33 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
       return GL_INVALID_INDEX;
 
    switch (programInterface) {
+   case GL_TESS_CONTROL_SUBROUTINE:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_VERTEX_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
    case GL_UNIFORM:
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      /* Validate name syntax for array variables */
-      if (!valid_program_resource_index_name(name))
-         return GL_INVALID_INDEX;
-      /* fall-through */
    case GL_UNIFORM_BLOCK:
-      res = _mesa_program_resource_find_name(shProg, programInterface, name);
-      if (!res)
+      res = _mesa_program_resource_find_name(shProg, programInterface, name,
+                                             &array_index);
+      if (!res || array_index > 0)
          return GL_INVALID_INDEX;
 
       return _mesa_program_resource_index(shProg, res);
    case GL_ATOMIC_COUNTER_BUFFER:
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
    }
 
    return GL_INVALID_INDEX;
@@ -250,19 +274,13 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
       _mesa_lookup_shader_program_err(ctx, program,
                                       "glGetProgramResourceName");
 
-   /* Set user friendly return values in case of errors. */
-   if (name)
-      *name = '\0';
-   if (length)
-      *length = 0;
-
    if (!shProg || !name)
       return;
 
    if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
-       !supported_interface_enum(programInterface)) {
+       !supported_interface_enum(ctx, programInterface)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
       return;
    }
 
@@ -300,36 +318,6 @@ _mesa_GetProgramResourceiv(GLuint program, GLenum programInterface,
                                 propCount, props, bufSize, length, params);
 }
 
-/**
- * Function verifies syntax of given name for GetProgramResourceLocation
- * and GetProgramResourceLocationIndex for the following cases:
- *
- * "array element portion of a string passed to GetProgramResourceLocation
- * or GetProgramResourceLocationIndex must not have, a "+" sign, extra
- * leading zeroes, or whitespace".
- *
- * Check is written to be compatible with GL_ARB_array_of_arrays.
- */
-static bool
-invalid_array_element_syntax(const GLchar *name)
-{
-   char *first = strchr(name, '[');
-   char *last = strrchr(name, '[');
-
-   if (!first)
-      return false;
-
-   /* No '+' or ' ' allowed anywhere. */
-   if (strchr(first, '+') || strchr(first, ' '))
-      return true;
-
-   /* Check that last array index is 0. */
-   if (last[1] == '0' && last[2] != ']')
-      return true;
-
-   return false;
-}
-
 static struct gl_shader_program *
 lookup_linked_program(GLuint program, const char *caller)
 {
@@ -356,7 +344,7 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocation");
 
-   if (!shProg || !name || invalid_array_element_syntax(name))
+   if (!shProg || !name)
       return -1;
 
    /* Validate programInterface. */
@@ -366,24 +354,33 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface,
    case GL_PROGRAM_OUTPUT:
       break;
 
-   /* For reference valid cases requiring additional extension support:
-    * GL_ARB_shader_subroutine
-    * GL_ARB_tessellation_shader
-    * GL_ARB_compute_shader
-    */
    case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      if (!_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      if (!_mesa_has_geometry_shaders(ctx) || !_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      if (!_mesa_has_compute_shaders(ctx) || !_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
    case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
    case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
-   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
-   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
-   case GL_COMPUTE_SUBROUTINE_UNIFORM:
-
+      if (!_mesa_has_tessellation(ctx) || !_mesa_has_shader_subroutine(ctx))
+         goto fail;
+      break;
    default:
-      _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocation(%s %s)",
-                  _mesa_lookup_enum_by_nr(programInterface), name);
+         goto fail;
    }
 
    return _mesa_program_resource_location(shProg, programInterface, name);
+fail:
+   _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceLocation(%s %s)",
+               _mesa_enum_to_string(programInterface), name);
+   return -1;
 }
 
 /**
@@ -397,7 +394,7 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface,
    struct gl_shader_program *shProg =
       lookup_linked_program(program, "glGetProgramResourceLocationIndex");
 
-   if (!shProg || !name || invalid_array_element_syntax(name))
+   if (!shProg || !name)
       return -1;
 
    /* From the GL_ARB_program_interface_query spec:
@@ -408,7 +405,7 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface,
    if (programInterface != GL_PROGRAM_OUTPUT) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetProgramResourceLocationIndex(%s)",
-                  _mesa_lookup_enum_by_nr(programInterface));
+                  _mesa_enum_to_string(programInterface));
       return -1;
    }
 
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index 5ff1b953231..98366857f62 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -217,7 +217,7 @@ get_query_binding_point(struct gl_context *ctx, GLenum target, GLuint index)
 
    case GL_TESS_CONTROL_SHADER_PATCHES_ARB:
    case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB:
-      if (ctx->Extensions.ARB_tessellation_shader)
+      if (_mesa_has_tessellation(ctx))
          return get_pipe_stats_binding_point(ctx, target);
       else
          return NULL;
@@ -295,7 +295,7 @@ _mesa_CreateQueries(GLenum target, GLsizei n, GLuint *ids)
       break;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glCreateQueries(invalid target = %s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -390,7 +390,7 @@ _mesa_BeginQueryIndexed(GLenum target, GLuint index, GLuint id)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glBeginQueryIndexed(%s, %u, %u)\n",
-                  _mesa_lookup_enum_by_nr(target), index, id);
+                  _mesa_enum_to_string(target), index, id);
 
    if (!query_error_check_index(ctx, target, index))
       return;
@@ -412,7 +412,7 @@ _mesa_BeginQueryIndexed(GLenum target, GLuint index, GLuint id)
    if (*bindpt) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glBeginQuery{Indexed}(target=%s is active)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -496,7 +496,7 @@ _mesa_EndQueryIndexed(GLenum target, GLuint index)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glEndQueryIndexed(%s, %u)\n",
-                  _mesa_lookup_enum_by_nr(target), index);
+                  _mesa_enum_to_string(target), index);
 
    if (!query_error_check_index(ctx, target, index))
       return;
@@ -516,8 +516,8 @@ _mesa_EndQueryIndexed(GLenum target, GLuint index)
    if (q && q->Target != target) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glEndQuery(target=%s with active query of target %s)",
-                  _mesa_lookup_enum_by_nr(target),
-                  _mesa_lookup_enum_by_nr(q->Target));
+                  _mesa_enum_to_string(target),
+                  _mesa_enum_to_string(q->Target));
       return;
    }
 
@@ -553,7 +553,7 @@ _mesa_QueryCounter(GLuint id, GLenum target)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glQueryCounter(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
 
    /* error checking */
    if (target != GL_TIMESTAMP) {
@@ -628,9 +628,9 @@ _mesa_GetQueryIndexediv(GLenum target, GLuint index, GLenum pname,
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryIndexediv(%s, %u, %s)\n",
-                  _mesa_lookup_enum_by_nr(target),
+                  _mesa_enum_to_string(target),
                   index,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (!query_error_check_index(ctx, target, index))
       return;
@@ -712,7 +712,7 @@ _mesa_GetQueryIndexediv(GLenum target, GLuint index, GLenum pname,
          default:
             _mesa_problem(ctx,
                           "Unknown target in glGetQueryIndexediv(target = %s)",
-                          _mesa_lookup_enum_by_nr(target));
+                          _mesa_enum_to_string(target));
             *params = 0;
             break;
          }
@@ -740,7 +740,7 @@ _mesa_GetQueryObjectiv(GLuint id, GLenum pname, GLint *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjectiv(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
@@ -794,7 +794,7 @@ _mesa_GetQueryObjectuiv(GLuint id, GLenum pname, GLuint *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjectuiv(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
@@ -851,7 +851,7 @@ _mesa_GetQueryObjecti64v(GLuint id, GLenum pname, GLint64EXT *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjecti64v(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
@@ -894,7 +894,7 @@ _mesa_GetQueryObjectui64v(GLuint id, GLenum pname, GLuint64EXT *params)
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glGetQueryObjectui64v(%u, %s)\n", id,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
 
    if (id)
       q = _mesa_lookup_query_object(ctx, id);
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index a3357cd6419..d826ecfc3d5 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -47,28 +47,47 @@
  * Return true if the conversion L=R+G+B is needed.
  */
 GLboolean
-_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
+_mesa_need_rgb_to_luminance_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat)
 {
-   GLenum baseTexFormat = _mesa_get_format_base_format(texFormat);
-
-   return (baseTexFormat == GL_RG ||
-           baseTexFormat == GL_RGB ||
-           baseTexFormat == GL_RGBA) &&
-          (format == GL_LUMINANCE ||
-           format == GL_LUMINANCE_ALPHA ||
-           format == GL_LUMINANCE_INTEGER_EXT ||
-           format == GL_LUMINANCE_ALPHA_INTEGER_EXT);
+   return (srcBaseFormat == GL_RG ||
+           srcBaseFormat == GL_RGB ||
+           srcBaseFormat == GL_RGBA) &&
+          (dstBaseFormat == GL_LUMINANCE ||
+           dstBaseFormat == GL_LUMINANCE_ALPHA);
 }
 
+/**
+ * Return true if the L, LA or I to RGB conversion is needed.
+ */
+GLboolean
+_mesa_need_luminance_to_rgb_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat)
+{
+   return (srcBaseFormat == GL_LUMINANCE ||
+           srcBaseFormat == GL_LUMINANCE_ALPHA ||
+           srcBaseFormat == GL_INTENSITY) &&
+          (dstBaseFormat == GL_GREEN ||
+           dstBaseFormat == GL_BLUE ||
+           dstBaseFormat == GL_RG ||
+           dstBaseFormat == GL_RGB ||
+           dstBaseFormat == GL_BGR ||
+           dstBaseFormat == GL_RGBA ||
+           dstBaseFormat == GL_BGRA);
+}
 
 /**
  * Return transfer op flags for this ReadPixels operation.
  */
-static GLbitfield
-get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
-                            GLenum format, GLenum type, GLboolean uses_blit)
+GLbitfield
+_mesa_get_readpixels_transfer_ops(const struct gl_context *ctx,
+                                  mesa_format texFormat,
+                                  GLenum format, GLenum type,
+                                  GLboolean uses_blit)
 {
    GLbitfield transferOps = ctx->_ImageTransferState;
+   GLenum srcBaseFormat = _mesa_get_format_base_format(texFormat);
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
 
    if (format == GL_DEPTH_COMPONENT ||
        format == GL_DEPTH_STENCIL ||
@@ -105,7 +124,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
     * have any effect anyway.
     */
    if (_mesa_get_format_datatype(texFormat) == GL_UNSIGNED_NORMALIZED &&
-       !_mesa_need_rgb_to_luminance_conversion(texFormat, format)) {
+       !_mesa_need_rgb_to_luminance_conversion(srcBaseFormat, dstBaseFormat)) {
       transferOps &= ~IMAGE_CLAMP_BIT;
    }
 
@@ -128,7 +147,7 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 {
    struct gl_renderbuffer *rb =
          _mesa_get_read_renderbuffer_for_format(ctx, format);
-   GLenum srcType;
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
 
    assert(rb);
 
@@ -149,28 +168,14 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 
    default:
       /* Color formats. */
-      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format)) {
-         return GL_TRUE;
-      }
-
-      /* Conversion between signed and unsigned integers needs masking
-       * (it isn't just memcpy). */
-      srcType = _mesa_get_format_datatype(rb->Format);
-
-      if ((srcType == GL_INT &&
-           (type == GL_UNSIGNED_INT ||
-            type == GL_UNSIGNED_SHORT ||
-            type == GL_UNSIGNED_BYTE)) ||
-          (srcType == GL_UNSIGNED_INT &&
-           (type == GL_INT ||
-            type == GL_SHORT ||
-            type == GL_BYTE))) {
+      if (_mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat,
+                                                 dstBaseFormat)) {
          return GL_TRUE;
       }
 
       /* And finally, see if there are any transfer ops. */
-      return get_readpixels_transfer_ops(ctx, rb->Format, format, type,
-                                         uses_blit) != 0;
+      return _mesa_get_readpixels_transfer_ops(ctx, rb->Format, format, type,
+                                               uses_blit) != 0;
    }
    return GL_FALSE;
 }
@@ -263,7 +268,7 @@ read_uint_depth_pixels( struct gl_context *ctx,
    GLubyte *map, *dst;
    int stride, dstStride, j;
 
-   if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0)
+   if (ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F)
       return GL_FALSE;
 
    if (packing->SwapBytes)
@@ -432,18 +437,19 @@ read_rgba_pixels( struct gl_context *ctx,
    uint8_t rebase_swizzle[4];
    struct gl_framebuffer *fb = ctx->ReadBuffer;
    struct gl_renderbuffer *rb = fb->_ColorReadBuffer;
+   GLenum dstBaseFormat = _mesa_unpack_format_to_base_format(format);
 
    if (!rb)
       return;
 
-   transferOps = get_readpixels_transfer_ops(ctx, rb->Format, format, type,
-                                             GL_FALSE);
+   transferOps = _mesa_get_readpixels_transfer_ops(ctx, rb->Format, format,
+                                                   type, GL_FALSE);
    /* Describe the dst format */
    dst_is_integer = _mesa_is_enum_format_integer(format);
    dst_stride = _mesa_image_row_stride(packing, width, format, type);
    dst_format = _mesa_format_from_format_and_type(format, type);
    convert_rgb_to_lum =
-      _mesa_need_rgb_to_luminance_conversion(rb->Format, format);
+      _mesa_need_rgb_to_luminance_conversion(rb->_BaseFormat, dstBaseFormat);
    dst = (GLubyte *) _mesa_image_address2d(packing, pixels, width, height,
                                            format, type, 0, 0);
 
@@ -815,7 +821,7 @@ read_depth_stencil_pixels(struct gl_context *ctx,
                           const struct gl_pixelstore_attrib *packing )
 {
    const GLboolean scaleOrBias
-      = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0;
+      = ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F;
    const GLboolean stencilTransfer = ctx->Pixel.IndexShift
       || ctx->Pixel.IndexOffset || ctx->Pixel.MapStencilFlag;
    GLubyte *dst;
@@ -910,10 +916,8 @@ read_pixels_es3_error_check(GLenum format, GLenum type,
    const GLenum data_type = _mesa_get_format_datatype(rb->Format);
    GLboolean is_unsigned_int = GL_FALSE;
    GLboolean is_signed_int = GL_FALSE;
-
-   if (!_mesa_is_color_format(internalFormat)) {
-      return GL_INVALID_OPERATION;
-   }
+   GLboolean is_float_depth = (internalFormat == GL_DEPTH_COMPONENT32F) ||
+         (internalFormat == GL_DEPTH32F_STENCIL8);
 
    is_unsigned_int = _mesa_is_enum_format_unsigned_int(internalFormat);
    if (!is_unsigned_int) {
@@ -944,6 +948,43 @@ read_pixels_es3_error_check(GLenum format, GLenum type,
           (is_unsigned_int && type == GL_UNSIGNED_INT))
          return GL_NO_ERROR;
       break;
+   case GL_DEPTH_STENCIL:
+      switch (type) {
+      case GL_FLOAT_32_UNSIGNED_INT_24_8_REV:
+         if (is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      case GL_UNSIGNED_INT_24_8:
+         if (!is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      default:
+         return GL_INVALID_ENUM;
+      }
+      break;
+   case GL_DEPTH_COMPONENT:
+      switch (type) {
+      case GL_FLOAT:
+         if (is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      case GL_UNSIGNED_SHORT:
+      case GL_UNSIGNED_INT_24_8:
+         if (!is_float_depth)
+            return GL_NO_ERROR;
+         break;
+      default:
+         return GL_INVALID_ENUM;
+      }
+      break;
+   case GL_STENCIL_INDEX:
+      switch (type) {
+      case GL_UNSIGNED_BYTE:
+         return GL_NO_ERROR;
+      default:
+         return GL_INVALID_ENUM;
+      }
+      break;
    }
 
    return GL_INVALID_OPERATION;
@@ -966,8 +1007,8 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glReadPixels(%d, %d, %s, %s, %p)\n",
                   width, height,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type),
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type),
                   pixels);
 
    if (width < 0 || height < 0) {
@@ -1017,15 +1058,10 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
          err = read_pixels_es3_error_check(format, type, rb);
       }
 
-      if (err == GL_NO_ERROR && (format == GL_DEPTH_COMPONENT
-          || format == GL_DEPTH_STENCIL)) {
-         err = GL_INVALID_ENUM;
-      }
-
       if (err != GL_NO_ERROR) {
          _mesa_error(ctx, err, "glReadPixels(invalid format %s and/or type %s)",
-                     _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type));
+                     _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type));
          return;
       }
    }
@@ -1033,8 +1069,8 @@ _mesa_ReadnPixelsARB( GLint x, GLint y, GLsizei width, GLsizei height,
    err = _mesa_error_check_format_and_type(ctx, format, type);
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err, "glReadPixels(invalid format %s and/or type %s)",
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       return;
    }
 
diff --git a/src/mesa/main/readpix.h b/src/mesa/main/readpix.h
index 1636dd9ce3e..481ad9d9c37 100644
--- a/src/mesa/main/readpix.h
+++ b/src/mesa/main/readpix.h
@@ -38,7 +38,18 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
                                  GLenum type, GLboolean uses_blit);
 
 extern GLboolean
-_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format);
+_mesa_need_rgb_to_luminance_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat);
+
+extern GLboolean
+_mesa_need_luminance_to_rgb_conversion(GLenum srcBaseFormat,
+                                       GLenum dstBaseFormat);
+
+extern GLbitfield
+_mesa_get_readpixels_transfer_ops(const struct gl_context *ctx,
+                                  mesa_format texFormat,
+                                  GLenum format, GLenum type,
+                                  GLboolean uses_blit);
 
 extern void
 _mesa_readpixels(struct gl_context *ctx,
diff --git a/src/mesa/main/samplerobj.c b/src/mesa/main/samplerobj.c
index a3aacc66aa3..32180fb1ba2 100644
--- a/src/mesa/main/samplerobj.c
+++ b/src/mesa/main/samplerobj.c
@@ -689,7 +689,7 @@ set_sampler_max_anisotropy(struct gl_context *ctx,
    if (samp->MaxAnisotropy == param)
       return GL_FALSE;
 
-   if (param < 1.0)
+   if (param < 1.0F)
       return INVALID_VALUE;
 
    flush(ctx);
@@ -813,7 +813,7 @@ _mesa_SamplerParameteri(GLuint sampler, GLenum pname, GLint param)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteri(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteri(param=%d)\n",
@@ -906,7 +906,7 @@ _mesa_SamplerParameterf(GLuint sampler, GLenum pname, GLfloat param)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterf(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterf(param=%f)\n",
@@ -1006,7 +1006,7 @@ _mesa_SamplerParameteriv(GLuint sampler, GLenum pname, const GLint *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteriv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameteriv(param=%d)\n",
@@ -1099,7 +1099,7 @@ _mesa_SamplerParameterfv(GLuint sampler, GLenum pname, const GLfloat *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterfv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterfv(param=%f)\n",
@@ -1184,7 +1184,7 @@ _mesa_SamplerParameterIiv(GLuint sampler, GLenum pname, const GLint *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIiv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIiv(param=%d)\n",
@@ -1270,7 +1270,7 @@ _mesa_SamplerParameterIuiv(GLuint sampler, GLenum pname, const GLuint *params)
       break;
    case INVALID_PNAME:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIuiv(pname=%s)\n",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       break;
    case INVALID_PARAM:
       _mesa_error(ctx, GL_INVALID_ENUM, "glSamplerParameterIuiv(param=%u)\n",
@@ -1380,7 +1380,7 @@ _mesa_GetSamplerParameteriv(GLuint sampler, GLenum pname, GLint *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameteriv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1466,7 +1466,7 @@ _mesa_GetSamplerParameterfv(GLuint sampler, GLenum pname, GLfloat *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterfv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1545,7 +1545,7 @@ _mesa_GetSamplerParameterIiv(GLuint sampler, GLenum pname, GLint *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterIiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1624,7 +1624,7 @@ _mesa_GetSamplerParameterIuiv(GLuint sampler, GLenum pname, GLuint *params)
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetSamplerParameterIuiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index a6246a39aad..ee7320221e2 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -44,7 +44,8 @@ extern "C" {
 
 static GLint
 program_resource_location(struct gl_shader_program *shProg,
-                          struct gl_program_resource *res, const char *name);
+                          struct gl_program_resource *res, const char *name,
+                          unsigned array_index);
 
 /**
  * Declare convenience functions to return resource data in a given type.
@@ -61,6 +62,7 @@ DECL_RESOURCE_FUNC(UBO, gl_uniform_block);
 DECL_RESOURCE_FUNC(UNI, gl_uniform_storage);
 DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer);
 DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
 
 void GLAPIENTRY
 _mesa_BindAttribLocation(GLhandleARB program, GLuint index,
@@ -189,63 +191,6 @@ _mesa_GetActiveAttrib(GLhandleARB program, GLuint desired_index,
                                   (GLint *) type, "glGetActiveAttrib");
 }
 
-/* Locations associated with shader variables (array or non-array) can be
- * queried using its base name or using the base name appended with the
- * valid array index. For example, in case of below vertex shader, valid
- * queries can be made to know the location of "xyz", "array", "array[0]",
- * "array[1]", "array[2]" and "array[3]". In this example index reurned
- * will be 0, 0, 0, 1, 2, 3 respectively.
- *
- * [Vertex Shader]
- * layout(location=0) in vec4 xyz;
- * layout(location=1) in vec4[4] array;
- * void main()
- * { }
- *
- * This requirement came up with the addition of ARB_program_interface_query
- * to OpenGL 4.3 specification. See page 101 (page 122 of the PDF) for details.
- *
- * This utility function is used by:
- * _mesa_GetAttribLocation
- * _mesa_GetFragDataLocation
- * _mesa_GetFragDataIndex
- *
- * Returns 0:
- *    if the 'name' string matches var->name.
- * Returns 'matched index':
- *    if the 'name' string matches var->name appended with valid array index.
- */
-int static inline
-get_matching_index(const ir_variable *const var, const char *name) {
-   unsigned idx = 0;
-   const char *const paren = strchr(name, '[');
-   const unsigned len = (paren != NULL) ? paren - name : strlen(name);
-
-   if (paren != NULL) {
-      if (!var->type->is_array())
-         return -1;
-
-      char *endptr;
-      idx = (unsigned) strtol(paren + 1, &endptr, 10);
-      const unsigned idx_len = endptr != (paren + 1) ? endptr - paren - 1 : 0;
-
-      /* Validate the sub string representing index in 'name' string */
-      if ((idx > 0 && paren[1] == '0') /* leading zeroes */
-          || (idx == 0 && idx_len > 1) /* all zeroes */
-          || paren[1] == ' ' /* whitespace */
-          || endptr[0] != ']' /* closing brace */
-          || endptr[1] != '\0' /* null char */
-          || idx_len == 0 /* missing index */
-          || idx >= var->type->length) /* exceeding array bound */
-         return -1;
-   }
-
-   if (strncmp(var->name, name, len) == 0 && var->name[len] == '\0')
-      return idx;
-
-   return -1;
-}
-
 GLint GLAPIENTRY
 _mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name)
 {
@@ -271,13 +216,15 @@ _mesa_GetAttribLocation(GLhandleARB program, const GLcharARB * name)
    if (shProg->_LinkedShaders[MESA_SHADER_VERTEX] == NULL)
       return -1;
 
+   unsigned array_index = 0;
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, GL_PROGRAM_INPUT, name);
+      _mesa_program_resource_find_name(shProg, GL_PROGRAM_INPUT, name,
+                                       &array_index);
 
    if (!res)
       return -1;
 
-   GLint loc = program_resource_location(shProg, res, name);
+   GLint loc = program_resource_location(shProg, res, name, array_index);
 
    /* The extra check against against 0 is made because of builtin-attribute
     * locations that have offset applied. Function program_resource_location
@@ -455,13 +402,15 @@ _mesa_GetFragDataLocation(GLuint program, const GLchar *name)
    if (shProg->_LinkedShaders[MESA_SHADER_FRAGMENT] == NULL)
       return -1;
 
+   unsigned array_index = 0;
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, GL_PROGRAM_OUTPUT, name);
+      _mesa_program_resource_find_name(shProg, GL_PROGRAM_OUTPUT, name,
+                                       &array_index);
 
    if (!res)
       return -1;
 
-   GLint loc = program_resource_location(shProg, res, name);
+   GLint loc = program_resource_location(shProg, res, name, array_index);
 
    /* The extra check against against 0 is made because of builtin-attribute
     * locations that have offset applied. Function program_resource_location
@@ -497,6 +446,20 @@ _mesa_program_resource_name(struct gl_program_resource *res)
       return RESOURCE_VAR(res)->name;
    case GL_UNIFORM:
       return RESOURCE_UNI(res)->name;
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      return RESOURCE_UNI(res)->name + MESA_SUBROUTINE_PREFIX_LEN;
+   case GL_VERTEX_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_TESS_CONTROL_SUBROUTINE:
+   case GL_TESS_EVALUATION_SUBROUTINE:
+      return RESOURCE_SUB(res)->name;
    default:
       assert(!"support for resource type not implemented");
    }
@@ -515,7 +478,19 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
    case GL_PROGRAM_OUTPUT:
       return RESOURCE_VAR(res)->data.max_array_access;
    case GL_UNIFORM:
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
       return RESOURCE_UNI(res)->array_elements;
+   case GL_VERTEX_SUBROUTINE:
+   case GL_GEOMETRY_SUBROUTINE:
+   case GL_FRAGMENT_SUBROUTINE:
+   case GL_COMPUTE_SUBROUTINE:
+   case GL_TESS_CONTROL_SUBROUTINE:
+   case GL_TESS_EVALUATION_SUBROUTINE:
    case GL_ATOMIC_COUNTER_BUFFER:
    case GL_UNIFORM_BLOCK:
       return 0;
@@ -525,39 +500,32 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
    return 0;
 }
 
-static int
-array_index_of_resource(struct gl_program_resource *res,
-                        const char *name)
+/**
+ * Checks if array subscript is valid and if so sets array_index.
+ */
+static bool
+valid_array_index(const GLchar *name, unsigned *array_index)
 {
-   assert(res->Data);
+   long idx = 0;
+   const GLchar *out_base_name_end;
 
-   switch (res->Type) {
-   case GL_PROGRAM_INPUT:
-   case GL_PROGRAM_OUTPUT:
-      return get_matching_index(RESOURCE_VAR(res), name);
-   default:
-      assert(!"support for resource type not implemented");
-      return -1;
-   }
+   idx = parse_program_resource_name(name, &out_base_name_end);
+   if (idx < 0)
+      return false;
+
+   if (array_index)
+      *array_index = idx;
+
+   return true;
 }
 
 /* Find a program resource with specific name in given interface.
  */
 struct gl_program_resource *
 _mesa_program_resource_find_name(struct gl_shader_program *shProg,
-                                 GLenum programInterface, const char *name)
+                                 GLenum programInterface, const char *name,
+                                 unsigned *array_index)
 {
-   GET_CURRENT_CONTEXT(ctx);
-   const char *full_name = name;
-
-   /* When context has 'VertexID_is_zero_based' set, gl_VertexID has been
-    * lowered to gl_VertexIDMESA.
-    */
-   if (name && ctx->Const.VertexID_is_zero_based) {
-      if (strcmp(name, "gl_VertexID") == 0)
-         full_name = "gl_VertexIDMESA";
-   }
-
    struct gl_program_resource *res = shProg->ProgramResourceList;
    for (unsigned i = 0; i < shProg->NumProgramResourceList; i++, res++) {
       if (res->Type != programInterface)
@@ -567,26 +535,46 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg,
       const char *rname = _mesa_program_resource_name(res);
       unsigned baselen = strlen(rname);
 
-      switch (programInterface) {
-      case GL_TRANSFORM_FEEDBACK_VARYING:
-      case GL_UNIFORM_BLOCK:
-      case GL_UNIFORM:
-         if (strncmp(rname, name, baselen) == 0) {
+      if (strncmp(rname, name, baselen) == 0) {
+         switch (programInterface) {
+         case GL_UNIFORM_BLOCK:
             /* Basename match, check if array or struct. */
             if (name[baselen] == '\0' ||
                 name[baselen] == '[' ||
                 name[baselen] == '.') {
                return res;
             }
+            break;
+         case GL_TRANSFORM_FEEDBACK_VARYING:
+         case GL_UNIFORM:
+         case GL_VERTEX_SUBROUTINE_UNIFORM:
+         case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+         case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+         case GL_COMPUTE_SUBROUTINE_UNIFORM:
+         case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+         case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+         case GL_VERTEX_SUBROUTINE:
+         case GL_GEOMETRY_SUBROUTINE:
+         case GL_FRAGMENT_SUBROUTINE:
+         case GL_COMPUTE_SUBROUTINE:
+         case GL_TESS_CONTROL_SUBROUTINE:
+         case GL_TESS_EVALUATION_SUBROUTINE:
+            if (name[baselen] == '.') {
+               return res;
+            }
+            /* fall-through */
+         case GL_PROGRAM_INPUT:
+         case GL_PROGRAM_OUTPUT:
+            if (name[baselen] == '\0') {
+               return res;
+            } else if (name[baselen] == '[' &&
+                valid_array_index(name, array_index)) {
+               return res;
+            }
+            break;
+         default:
+            assert(!"not implemented for given interface");
          }
-         break;
-      case GL_PROGRAM_INPUT:
-      case GL_PROGRAM_OUTPUT:
-         if (array_index_of_resource(res, full_name) >= 0)
-            return res;
-         break;
-      default:
-         assert(!"not implemented for given interface");
       }
    }
    return NULL;
@@ -651,6 +639,18 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
       case GL_UNIFORM:
+      case GL_VERTEX_SUBROUTINE_UNIFORM:
+      case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+      case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      case GL_VERTEX_SUBROUTINE:
+      case GL_GEOMETRY_SUBROUTINE:
+      case GL_FRAGMENT_SUBROUTINE:
+      case GL_COMPUTE_SUBROUTINE:
+      case GL_TESS_CONTROL_SUBROUTINE:
+      case GL_TESS_EVALUATION_SUBROUTINE:
          if (++idx == (int) index)
             return res;
          break;
@@ -719,6 +719,12 @@ _mesa_get_program_resource_name(struct gl_shader_program *shProg,
    bool add_index = !(((programInterface == GL_PROGRAM_INPUT) &&
                        res->StageReferences & (1 << MESA_SHADER_GEOMETRY)));
 
+   /* Transform feedback varyings have array index already appended
+    * in their names.
+    */
+   if (programInterface == GL_TRANSFORM_FEEDBACK_VARYING)
+      add_index = false;
+
    if (add_index && _mesa_program_resource_array_size(res)) {
       int i;
 
@@ -736,17 +742,9 @@ _mesa_get_program_resource_name(struct gl_shader_program *shProg,
 
 static GLint
 program_resource_location(struct gl_shader_program *shProg,
-                          struct gl_program_resource *res, const char *name)
+                          struct gl_program_resource *res, const char *name,
+                          unsigned array_index)
 {
-   unsigned index, offset;
-   int array_index = -1;
-
-   if (res->Type == GL_PROGRAM_INPUT || res->Type == GL_PROGRAM_OUTPUT) {
-      array_index = array_index_of_resource(res, name);
-      if (array_index < 0)
-         return -1;
-   }
-
    /* Built-in locations should report GL_INVALID_INDEX. */
    if (is_gl_identifier(name))
       return GL_INVALID_INDEX;
@@ -757,13 +755,22 @@ program_resource_location(struct gl_shader_program *shProg,
     */
    switch (res->Type) {
    case GL_PROGRAM_INPUT:
+      /* If the input is an array, fail if the index is out of bounds. */
+      if (array_index > 0
+          && array_index >= RESOURCE_VAR(res)->type->length) {
+         return -1;
+      }
       return RESOURCE_VAR(res)->data.location + array_index - VERT_ATTRIB_GENERIC0;
    case GL_PROGRAM_OUTPUT:
+      /* If the output is an array, fail if the index is out of bounds. */
+      if (array_index > 0
+          && array_index >= RESOURCE_VAR(res)->type->length) {
+         return -1;
+      }
       return RESOURCE_VAR(res)->data.location + array_index - FRAG_RESULT_DATA0;
    case GL_UNIFORM:
-      index = _mesa_get_uniform_location(shProg, name, &offset);
-
-      if (index == GL_INVALID_INDEX)
+      /* If the uniform is built-in, fail. */
+      if (RESOURCE_UNI(res)->builtin)
          return -1;
 
       /* From the GL_ARB_uniform_buffer_object spec:
@@ -777,9 +784,21 @@ program_resource_location(struct gl_shader_program *shProg,
           RESOURCE_UNI(res)->atomic_buffer_index != -1)
          return -1;
 
-      /* location in remap table + array element offset */
-      return RESOURCE_UNI(res)->remap_location + offset;
+      /* fallthrough */
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      /* If the uniform is an array, fail if the index is out of bounds. */
+      if (array_index > 0
+          && array_index >= RESOURCE_UNI(res)->array_elements) {
+         return -1;
+      }
 
+      /* location in remap table + array element offset */
+      return RESOURCE_UNI(res)->remap_location + array_index;
    default:
       return -1;
    }
@@ -787,22 +806,22 @@ program_resource_location(struct gl_shader_program *shProg,
 
 /**
  * Function implements following location queries:
- *    glGetAttribLocation
- *    glGetFragDataLocation
  *    glGetUniformLocation
  */
 GLint
 _mesa_program_resource_location(struct gl_shader_program *shProg,
                                 GLenum programInterface, const char *name)
 {
+   unsigned array_index = 0;
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, programInterface, name);
+      _mesa_program_resource_find_name(shProg, programInterface, name,
+                                       &array_index);
 
    /* Resource not found. */
    if (!res)
       return -1;
 
-   return program_resource_location(shProg, res, name);
+   return program_resource_location(shProg, res, name, array_index);
 }
 
 /**
@@ -814,7 +833,7 @@ _mesa_program_resource_location_index(struct gl_shader_program *shProg,
                                       GLenum programInterface, const char *name)
 {
    struct gl_program_resource *res =
-      _mesa_program_resource_find_name(shProg, programInterface, name);
+      _mesa_program_resource_find_name(shProg, programInterface, name, NULL);
 
    /* Non-existent variable or resource is not referenced by fragment stage. */
    if (!res || !(res->StageReferences & (1 << MESA_SHADER_FRAGMENT)))
@@ -829,6 +848,10 @@ stage_from_enum(GLenum ref)
    switch (ref) {
    case GL_REFERENCED_BY_VERTEX_SHADER:
       return MESA_SHADER_VERTEX;
+   case GL_REFERENCED_BY_TESS_CONTROL_SHADER:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_REFERENCED_BY_TESS_EVALUATION_SHADER:
+      return MESA_SHADER_TESS_EVAL;
    case GL_REFERENCED_BY_GEOMETRY_SHADER:
       return MESA_SHADER_GEOMETRY;
    case GL_REFERENCED_BY_FRAGMENT_SHADER:
@@ -886,7 +909,8 @@ get_buffer_property(struct gl_shader_program *shProg,
          for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) {
             const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName;
             struct gl_program_resource *uni =
-               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname);
+               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname,
+                                                NULL);
             if (!uni)
                continue;
             (*val)++;
@@ -896,7 +920,8 @@ get_buffer_property(struct gl_shader_program *shProg,
          for (unsigned i = 0; i < RESOURCE_UBO(res)->NumUniforms; i++) {
             const char *iname = RESOURCE_UBO(res)->Uniforms[i].IndexName;
             struct gl_program_resource *uni =
-               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname);
+               _mesa_program_resource_find_name(shProg, GL_UNIFORM, iname,
+                                                NULL);
             if (!uni)
                continue;
             *val++ =
@@ -925,8 +950,8 @@ get_buffer_property(struct gl_shader_program *shProg,
 
 invalid_operation:
    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s prop %s)", caller,
-               _mesa_lookup_enum_by_nr(res->Type),
-               _mesa_lookup_enum_by_nr(prop));
+               _mesa_enum_to_string(res->Type),
+               _mesa_enum_to_string(prop));
 
    return 0;
 }
@@ -944,11 +969,17 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
 
    switch(prop) {
    case GL_NAME_LENGTH:
-      if (res->Type == GL_ATOMIC_COUNTER_BUFFER)
+      switch (res->Type) {
+      case GL_ATOMIC_COUNTER_BUFFER:
          goto invalid_operation;
-      /* Base name +3 if array '[0]' + terminator. */
-      *val = strlen(_mesa_program_resource_name(res)) +
-         (_mesa_program_resource_array_size(res) > 0 ? 3 : 0) + 1;
+      case GL_TRANSFORM_FEEDBACK_VARYING:
+         *val = strlen(_mesa_program_resource_name(res)) + 1;
+         break;
+      default:
+         /* Base name +3 if array '[0]' + terminator. */
+         *val = strlen(_mesa_program_resource_name(res)) +
+            (_mesa_program_resource_array_size(res) > 0 ? 3 : 0) + 1;
+      }
       return 1;
    case GL_TYPE:
       switch (res->Type) {
@@ -1014,6 +1045,8 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          goto invalid_enum;
       /* fallthrough */
    case GL_REFERENCED_BY_VERTEX_SHADER:
+   case GL_REFERENCED_BY_TESS_CONTROL_SHADER:
+   case GL_REFERENCED_BY_TESS_EVALUATION_SHADER:
    case GL_REFERENCED_BY_GEOMETRY_SHADER:
    case GL_REFERENCED_BY_FRAGMENT_SHADER:
       switch (res->Type) {
@@ -1034,7 +1067,8 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
          *val = program_resource_location(shProg, res,
-                                          _mesa_program_resource_name(res));
+                                          _mesa_program_resource_name(res),
+                                          0);
          return 1;
       default:
          goto invalid_operation;
@@ -1045,10 +1079,54 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       *val = RESOURCE_VAR(res)->data.index;
       return 1;
 
+   case GL_NUM_COMPATIBLE_SUBROUTINES:
+      if (res->Type != GL_VERTEX_SUBROUTINE_UNIFORM &&
+          res->Type != GL_FRAGMENT_SUBROUTINE_UNIFORM &&
+          res->Type != GL_GEOMETRY_SUBROUTINE_UNIFORM &&
+          res->Type != GL_COMPUTE_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_CONTROL_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_EVALUATION_SUBROUTINE_UNIFORM)
+         goto invalid_operation;
+      *val = RESOURCE_UNI(res)->num_compatible_subroutines;
+      return 1;
+   case GL_COMPATIBLE_SUBROUTINES: {
+      const struct gl_uniform_storage *uni;
+      struct gl_shader *sh;
+      unsigned count, i;
+      int j;
+
+      if (res->Type != GL_VERTEX_SUBROUTINE_UNIFORM &&
+          res->Type != GL_FRAGMENT_SUBROUTINE_UNIFORM &&
+          res->Type != GL_GEOMETRY_SUBROUTINE_UNIFORM &&
+          res->Type != GL_COMPUTE_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_CONTROL_SUBROUTINE_UNIFORM &&
+          res->Type != GL_TESS_EVALUATION_SUBROUTINE_UNIFORM)
+         goto invalid_operation;
+      uni = RESOURCE_UNI(res);
+
+      sh = shProg->_LinkedShaders[_mesa_shader_stage_from_subroutine_uniform(res->Type)];
+      count = 0;
+      for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+         struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i];
+         for (j = 0; j < fn->num_compat_types; j++) {
+            if (fn->types[j] == uni->type) {
+               val[count++] = i;
+               break;
+            }
+         }
+      }
+      return count;
+   }
    /* GL_ARB_tessellation_shader */
    case GL_IS_PER_PATCH:
-   case GL_REFERENCED_BY_TESS_CONTROL_SHADER:
-   case GL_REFERENCED_BY_TESS_EVALUATION_SHADER:
+      switch (res->Type) {
+      case GL_PROGRAM_INPUT:
+      case GL_PROGRAM_OUTPUT:
+         *val = RESOURCE_VAR(res)->data.patch;
+         return 1;
+      default:
+         goto invalid_operation;
+      }
    default:
       goto invalid_enum;
    }
@@ -1057,14 +1135,14 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "%s(%s prop %s)", caller,
-               _mesa_lookup_enum_by_nr(res->Type),
-               _mesa_lookup_enum_by_nr(prop));
+               _mesa_enum_to_string(res->Type),
+               _mesa_enum_to_string(prop));
    return 0;
 
 invalid_operation:
    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s prop %s)", caller,
-               _mesa_lookup_enum_by_nr(res->Type),
-               _mesa_lookup_enum_by_nr(prop));
+               _mesa_enum_to_string(res->Type),
+               _mesa_enum_to_string(prop));
    return 0;
 }
 
@@ -1086,7 +1164,7 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
    if (!res || bufSize < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glGetProgramResourceiv(%s index %d bufSize %d)",
-                  _mesa_lookup_enum_by_nr(programInterface), index, bufSize);
+                  _mesa_enum_to_string(programInterface), index, bufSize);
       return;
    }
 
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index a4296adf799..f9a7d130f9c 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -110,6 +110,7 @@ _mesa_init_shader_state(struct gl_context *ctx)
     */
    struct gl_shader_compiler_options options;
    gl_shader_stage sh;
+   int i;
 
    memset(&options, 0, sizeof(options));
    options.MaxUnrollIterations = 32;
@@ -126,6 +127,12 @@ _mesa_init_shader_state(struct gl_context *ctx)
    /* Extended for ARB_separate_shader_objects */
    ctx->Shader.RefCount = 1;
    mtx_init(&ctx->Shader.Mutex, mtx_plain);
+
+   ctx->TessCtrlProgram.patch_vertices = 3;
+   for (i = 0; i < 4; ++i)
+      ctx->TessCtrlProgram.patch_default_outer_level[i] = 1.0;
+   for (i = 0; i < 2; ++i)
+      ctx->TessCtrlProgram.patch_default_inner_level[i] = 1.0;
 }
 
 
@@ -199,6 +206,9 @@ _mesa_validate_shader_target(const struct gl_context *ctx, GLenum type)
       return ctx == NULL || ctx->Extensions.ARB_vertex_shader;
    case GL_GEOMETRY_SHADER_ARB:
       return ctx == NULL || _mesa_has_geometry_shaders(ctx);
+   case GL_TESS_CONTROL_SHADER:
+   case GL_TESS_EVALUATION_SHADER:
+      return ctx == NULL || _mesa_has_tessellation(ctx);
    case GL_COMPUTE_SHADER:
       return ctx == NULL || ctx->Extensions.ARB_compute_shader;
    default:
@@ -415,6 +425,8 @@ detach_shader(struct gl_context *ctx, GLuint program, GLuint shader)
          /* sanity check - make sure the new list's entries are sensible */
          for (j = 0; j < shProg->NumShaders; j++) {
             assert(shProg->Shaders[j]->Type == GL_VERTEX_SHADER ||
+                   shProg->Shaders[j]->Type == GL_TESS_CONTROL_SHADER ||
+                   shProg->Shaders[j]->Type == GL_TESS_EVALUATION_SHADER ||
                    shProg->Shaders[j]->Type == GL_GEOMETRY_SHADER ||
                    shProg->Shaders[j]->Type == GL_FRAGMENT_SHADER);
             assert(shProg->Shaders[j]->RefCount > 0);
@@ -510,6 +522,57 @@ check_gs_query(struct gl_context *ctx, const struct gl_shader_program *shProg)
 }
 
 
+/**
+ * Check if a tessellation control shader query is valid at this time.
+ * If not, report an error and return false.
+ *
+ * From GL 4.0 section 6.1.12 (Shader and Program Queries):
+ *
+ *     "If TESS_CONTROL_OUTPUT_VERTICES is queried for a program which has
+ *     not been linked successfully, or which does not contain objects to
+ *     form a tessellation control shader, then an INVALID_OPERATION error is
+ *     generated."
+ */
+static bool
+check_tcs_query(struct gl_context *ctx, const struct gl_shader_program *shProg)
+{
+   if (shProg->LinkStatus &&
+       shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL] != NULL) {
+      return true;
+   }
+
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "glGetProgramiv(linked tessellation control shader required)");
+   return false;
+}
+
+
+/**
+ * Check if a tessellation evaluation shader query is valid at this time.
+ * If not, report an error and return false.
+ *
+ * From GL 4.0 section 6.1.12 (Shader and Program Queries):
+ *
+ *     "If any of the pname values in this paragraph are queried for a program
+ *     which has not been linked successfully, or which does not contain
+ *     objects to form a tessellation evaluation shader, then an
+ *     INVALID_OPERATION error is generated."
+ *
+ */
+static bool
+check_tes_query(struct gl_context *ctx, const struct gl_shader_program *shProg)
+{
+   if (shProg->LinkStatus &&
+       shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL] != NULL) {
+      return true;
+   }
+
+   _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramiv(linked tessellation "
+               "evaluation shader required)");
+   return false;
+}
+
+
 /**
  * glGetProgramiv() - get shader program state.
  * Note that this is for GLSL shader programs, not ARB vertex/fragment
@@ -533,6 +596,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
     * and GL 3.2) are available in this context
     */
    const bool has_core_gs = _mesa_has_geometry_shaders(ctx);
+   const bool has_tess = _mesa_has_tessellation(ctx);
 
    /* Are uniform buffer objects available in this context?
     */
@@ -711,12 +775,44 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
    case GL_PROGRAM_SEPARABLE:
       *params = shProg->SeparateShader;
       return;
+
+   /* ARB_tessellation_shader */
+   case GL_TESS_CONTROL_OUTPUT_VERTICES:
+      if (!has_tess)
+         break;
+      if (check_tcs_query(ctx, shProg))
+         *params = shProg->TessCtrl.VerticesOut;
+      return;
+   case GL_TESS_GEN_MODE:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.PrimitiveMode;
+      return;
+   case GL_TESS_GEN_SPACING:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.Spacing;
+      return;
+   case GL_TESS_GEN_VERTEX_ORDER:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.VertexOrder;
+      return;
+   case GL_TESS_GEN_POINT_MODE:
+      if (!has_tess)
+         break;
+      if (check_tes_query(ctx, shProg))
+         *params = shProg->TessEval.PointMode;
+      return;
    default:
       break;
    }
 
    _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramiv(pname=%s)",
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -992,6 +1088,12 @@ print_shader_info(const struct gl_shader_program *shProg)
    if (shProg->_LinkedShaders[MESA_SHADER_GEOMETRY])
       printf("  geom prog %u\n",
 	     shProg->_LinkedShaders[MESA_SHADER_GEOMETRY]->Program->Id);
+   if (shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL])
+      printf("  tesc prog %u\n",
+	     shProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program->Id);
+   if (shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL])
+      printf("  tese prog %u\n",
+	     shProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]->Program->Id);
 }
 
 
@@ -1037,11 +1139,9 @@ use_shader_program(struct gl_context *ctx, gl_shader_stage stage,
        */
       switch (stage) {
       case MESA_SHADER_VERTEX:
-	 /* Empty for now. */
-	 break;
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
       case MESA_SHADER_GEOMETRY:
-	 /* Empty for now. */
-	 break;
       case MESA_SHADER_COMPUTE:
          /* Empty for now. */
          break;
@@ -1071,6 +1171,7 @@ _mesa_use_program(struct gl_context *ctx, struct gl_shader_program *shProg)
       use_shader_program(ctx, i, shProg, &ctx->Shader);
    _mesa_active_program(ctx, shProg, "glUseProgram");
 
+   _mesa_shader_program_init_subroutine_defaults(shProg);
    if (ctx->Driver.UseProgram)
       ctx->Driver.UseProgram(ctx, shProg);
 }
@@ -1172,7 +1273,7 @@ _mesa_CreateShader(GLenum type)
 {
    GET_CURRENT_CONTEXT(ctx);
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glCreateShader %s\n", _mesa_lookup_enum_by_nr(type));
+      _mesa_debug(ctx, "glCreateShader %s\n", _mesa_enum_to_string(type));
    return create_shader(ctx, type);
 }
 
@@ -1331,7 +1432,7 @@ void GLAPIENTRY
 _mesa_GetObjectParameterfvARB(GLhandleARB object, GLenum pname,
                               GLfloat *params)
 {
-   GLint iparams[1];  /* XXX is one element enough? */
+   GLint iparams[1] = {0};  /* XXX is one element enough? */
    _mesa_GetObjectParameterivARB(object, pname, iparams);
    params[0] = (GLfloat) iparams[0];
 }
@@ -1460,7 +1561,7 @@ read_shader(const char *fname)
  */
 void GLAPIENTRY
 _mesa_ShaderSource(GLhandleARB shaderObj, GLsizei count,
-                      const GLcharARB * const * string, const GLint * length)
+                   const GLcharARB * const * string, const GLint * length)
 {
    GET_CURRENT_CONTEXT(ctx);
    GLint *offsets;
@@ -1692,12 +1793,23 @@ _mesa_ShaderBinary(GLint n, const GLuint* shaders, GLenum binaryformat,
                    const void* binary, GLint length)
 {
    GET_CURRENT_CONTEXT(ctx);
-   (void) n;
    (void) shaders;
    (void) binaryformat;
    (void) binary;
-   (void) length;
-   _mesa_error(ctx, GL_INVALID_OPERATION, "glShaderBinary");
+
+   /* Page 68, section 7.2 "Shader Binaries" of the OpenGL ES 3.1 spec, and
+    * page 88 of the OpenGL 4.5 specs state:
+    *
+    *     "An INVALID_VALUE error is generated if count or length is negative.
+    *      An INVALID_ENUM error is generated if binaryformat is not a supported
+    *      format returned in SHADER_BINARY_FORMATS."
+    */
+   if (n < 0 || length < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glShaderBinary(count or length < 0)");
+      return;
+   }
+
+   _mesa_error(ctx, GL_INVALID_ENUM, "glShaderBinary(format)");
 }
 
 
@@ -1857,7 +1969,7 @@ _mesa_ProgramParameteri(GLuint program, GLenum pname, GLint value)
 
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glProgramParameteri(pname=%s)",
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 
@@ -1865,7 +1977,7 @@ invalid_value:
    _mesa_error(ctx, GL_INVALID_VALUE,
                "glProgramParameteri(pname=%s, value=%d): "
                "value must be 0 or 1.",
-               _mesa_lookup_enum_by_nr(pname),
+               _mesa_enum_to_string(pname),
                value);
 }
 
@@ -1885,7 +1997,8 @@ _mesa_use_shader_program(struct gl_context *ctx, GLenum type,
 
 static GLuint
 _mesa_create_shader_program(struct gl_context* ctx, GLboolean separate,
-                            GLenum type, GLsizei count, const GLchar* const *strings)
+                            GLenum type, GLsizei count,
+                            const GLchar* const *strings)
 {
    const GLuint shader = create_shader(ctx, type);
    GLuint program = 0;
@@ -1920,8 +2033,8 @@ _mesa_create_shader_program(struct gl_context* ctx, GLboolean separate,
 	    }
 #endif
 	 }
-
-	 ralloc_strcat(&shProg->InfoLog, sh->InfoLog);
+         if (sh->InfoLog)
+            ralloc_strcat(&shProg->InfoLog, sh->InfoLog);
       }
 
       delete_shader(ctx, shader);
@@ -1944,6 +2057,22 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
    case MESA_SHADER_VERTEX:
       dst->UsesClipDistanceOut = src->Vert.UsesClipDistance;
       break;
+   case MESA_SHADER_TESS_CTRL: {
+      struct gl_tess_ctrl_program *dst_tcp =
+         (struct gl_tess_ctrl_program *) dst;
+      dst_tcp->VerticesOut = src->TessCtrl.VerticesOut;
+      break;
+   }
+   case MESA_SHADER_TESS_EVAL: {
+      struct gl_tess_eval_program *dst_tep =
+         (struct gl_tess_eval_program *) dst;
+      dst_tep->PrimitiveMode = src->TessEval.PrimitiveMode;
+      dst_tep->Spacing = src->TessEval.Spacing;
+      dst_tep->VertexOrder = src->TessEval.VertexOrder;
+      dst_tep->PointMode = src->TessEval.PointMode;
+      dst->UsesClipDistanceOut = src->TessEval.UsesClipDistance;
+      break;
+   }
    case MESA_SHADER_GEOMETRY: {
       struct gl_geometry_program *dst_gp = (struct gl_geometry_program *) dst;
       dst_gp->VerticesIn = src->Geom.VerticesIn;
@@ -1954,20 +2083,20 @@ _mesa_copy_linked_program_data(gl_shader_stage type,
       dst->UsesClipDistanceOut = src->Geom.UsesClipDistance;
       dst_gp->UsesEndPrimitive = src->Geom.UsesEndPrimitive;
       dst_gp->UsesStreams = src->Geom.UsesStreams;
-   }
       break;
+   }
    case MESA_SHADER_FRAGMENT: {
       struct gl_fragment_program *dst_fp = (struct gl_fragment_program *) dst;
       dst_fp->FragDepthLayout = src->FragDepthLayout;
-   }
       break;
+   }
    case MESA_SHADER_COMPUTE: {
       struct gl_compute_program *dst_cp = (struct gl_compute_program *) dst;
       int i;
       for (i = 0; i < 3; i++)
          dst_cp->LocalSize[i] = src->Comp.LocalSize[i];
-   }
       break;
+   }
    default:
       break;
    }
@@ -1984,3 +2113,568 @@ _mesa_CreateShaderProgramv(GLenum type, GLsizei count,
 
    return _mesa_create_shader_program(ctx, GL_TRUE, type, count, strings);
 }
+
+
+/**
+ * glPatchParameteri (GL_ARB_tessellation_shader): validate \p pname and
+ * \p value, then store the value in ctx->TessCtrlProgram.patch_vertices. */
+extern void GLAPIENTRY
+_mesa_PatchParameteri(GLenum pname, GLint value)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!_mesa_has_tessellation(ctx)) { /* presumably: no tessellation support */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glPatchParameteri");
+      return;
+   }
+
+   if (pname != GL_PATCH_VERTICES) { /* the only pname accepted here */
+      _mesa_error(ctx, GL_INVALID_ENUM, "glPatchParameteri");
+      return;
+   }
+
+   if (value <= 0 || value > ctx->Const.MaxPatchVertices) { /* 1..max only */
+      _mesa_error(ctx, GL_INVALID_VALUE, "glPatchParameteri");
+      return;
+   }
+
+   ctx->TessCtrlProgram.patch_vertices = value;
+}
+
+/* glPatchParameterfv: store the default outer (4) or inner (2) tess levels. */
+extern void GLAPIENTRY
+_mesa_PatchParameterfv(GLenum pname, const GLfloat *values)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!_mesa_has_tessellation(ctx)) { /* presumably: no tessellation support */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glPatchParameterfv");
+      return;
+   }
+
+   switch(pname) {
+   case GL_PATCH_DEFAULT_OUTER_LEVEL:
+      FLUSH_VERTICES(ctx, 0);
+      memcpy(ctx->TessCtrlProgram.patch_default_outer_level, values,
+             4 * sizeof(GLfloat)); /* reads 4 floats; values is not checked */
+      ctx->NewDriverState |= ctx->DriverFlags.NewDefaultTessLevels;
+      return;
+   case GL_PATCH_DEFAULT_INNER_LEVEL:
+      FLUSH_VERTICES(ctx, 0);
+      memcpy(ctx->TessCtrlProgram.patch_default_inner_level, values,
+             2 * sizeof(GLfloat)); /* reads 2 floats; values is not checked */
+      ctx->NewDriverState |= ctx->DriverFlags.NewDefaultTessLevels;
+      return;
+   default: /* any other pname is rejected */
+      _mesa_error(ctx, GL_INVALID_ENUM, "glPatchParameterfv");
+      return;
+   }
+}
+
+/**
+ * ARB_shader_subroutine: glGetSubroutineUniformLocation.  Looks up \p name in
+ * the linked \p shadertype stage; each failure path below returns -1. */
+GLint GLAPIENTRY
+_mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
+                                   const GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetSubroutineUniformLocation";
+   struct gl_shader_program *shProg;
+   GLenum resource_type;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return -1;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return -1;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg) /* presumably the lookup raised the GL error (it has api_name) */
+      return -1;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return -1;
+   }
+
+   resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+   return _mesa_program_resource_location(shProg, resource_type, name);
+}
+
+GLuint GLAPIENTRY /* glGetSubroutineIndex; failures return (GLuint)-1 */
+_mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
+                         const GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetSubroutineIndex";
+   struct gl_shader_program *shProg;
+   struct gl_program_resource *res;
+   GLenum resource_type;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return -1;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return -1;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg) /* presumably the lookup raised the GL error (it has api_name) */
+      return -1;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return -1;
+   }
+
+   resource_type = _mesa_shader_stage_to_subroutine(stage);
+   res = _mesa_program_resource_find_name(shProg, resource_type, name, NULL);
+   if (!res) {
+      /* Spec: an unknown name yields GL_INVALID_INDEX but no GL error. */
+      return -1;
+   }
+
+   return _mesa_program_resource_index(shProg, res);
+}
+
+/* glGetActiveSubroutineUniformiv: query one property of a subroutine uniform. */
+GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
+                                   GLuint index, GLenum pname, GLint *values)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetActiveSubroutineUniformiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+   struct gl_program_resource *res;
+   const struct gl_uniform_storage *uni;
+   GLenum resource_type;
+   int count, i, j;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg) /* presumably the lookup raised the GL error (it has api_name) */
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   switch (pname) { /* NOTE(review): a bad index leaves values untouched */
+   case GL_NUM_COMPATIBLE_SUBROUTINES: {
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) {
+         uni = res->Data;
+         values[0] = uni->num_compatible_subroutines;
+      }
+      break;
+   }
+   case GL_COMPATIBLE_SUBROUTINES: {
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) {
+         uni = res->Data;
+         count = 0;
+         for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+            struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i];
+            for (j = 0; j < fn->num_compat_types; j++) {
+               if (fn->types[j] == uni->type) { /* function i fits the uniform */
+                  values[count++] = i;
+                  break; /* record each function index at most once */
+               }
+            }
+         }
+      }
+      break;
+   }
+   case GL_UNIFORM_SIZE:
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) {
+         uni = res->Data;
+         values[0] = uni->array_elements ? uni->array_elements : 1;
+      }
+      break;
+   case GL_UNIFORM_NAME_LENGTH:
+      res = _mesa_program_resource_find_index(shProg, resource_type, index);
+      if (res) { /* +1 for the NUL; +3 presumably for an "[0]" array suffix */
+         values[0] = strlen(_mesa_program_resource_name(res)) + 1
+            + ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);
+      }
+      break;
+   default: /* unknown pname is an enum error, as in glGetProgramStageiv */
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", api_name);
+      return;
+   }
+}
+
+/* glGetActiveSubroutineUniformName: fetch a subroutine uniform's name. */
+GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype,
+                                     GLuint index, GLsizei bufsize,
+                                     GLsizei *length, GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetActiveSubroutineUniformName";
+   struct gl_shader_program *shProg;
+   GLenum resource_type;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg) /* presumably the lookup raised the GL error (it has api_name) */
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+   /* delegate index/bufsize handling to the generic resource-name query */
+   _mesa_get_program_resource_name(shProg, resource_type,
+                                   index, bufsize,
+                                   length, name, api_name);
+}
+
+/* glGetActiveSubroutineName: fetch the name of a subroutine function. */
+GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype,
+                              GLuint index, GLsizei bufsize,
+                              GLsizei *length, GLchar *name)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetActiveSubroutineName";
+   struct gl_shader_program *shProg;
+   GLenum resource_type;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg) /* presumably the lookup raised the GL error (it has api_name) */
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   if (!shProg->_LinkedShaders[stage]) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+   resource_type = _mesa_shader_stage_to_subroutine(stage); /* not ..._uniform */
+   _mesa_get_program_resource_name(shProg, resource_type,
+                                   index, bufsize,
+                                   length, name, api_name);
+}
+
+/* glUniformSubroutinesuiv: set all subroutine uniforms of the current stage. */
+GLvoid GLAPIENTRY
+_mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
+                            const GLuint *indices)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glUniformSubroutinesuiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+   int i;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   shProg = ctx->_Shader->CurrentProgram[stage]; /* acts on the bound program */
+   if (!shProg) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   if (count != sh->NumSubroutineUniformRemapTable) { /* must cover them all */
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name);
+      return;
+   }
+
+   i = 0; /* pass 1: validate every index before any storage is modified */
+   while (i < count) { /* not do/while: count == 0 must not touch entry 0 */
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      int uni_count = uni->array_elements ? uni->array_elements : 1;
+      int j, k;
+
+      for (j = i; j < i + uni_count; j++) {
+         struct gl_subroutine_function *subfn;
+         if (indices[j] >= sh->NumSubroutineFunctions) {
+            _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name);
+            return;
+         }
+
+         subfn = &sh->SubroutineFunctions[indices[j]];
+         for (k = 0; k < subfn->num_compat_types; k++) {
+            if (subfn->types[k] == uni->type)
+               break;
+         }
+         if (k == subfn->num_compat_types) { /* function not compatible */
+            _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+            return;
+         }
+      }
+      i += uni_count; /* an array uniform consumes uni_count indices */
+   }
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
+   i = 0; /* pass 2: commit the validated indices */
+   while (i < count) { /* not do/while: count == 0 must not touch entry 0 */
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      int uni_count = uni->array_elements ? uni->array_elements : 1;
+
+      memcpy(&uni->storage[0], &indices[i],
+             sizeof(GLuint) * uni_count);
+
+      uni->initialized = true;
+      _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
+      i += uni_count;
+   }
+}
+
+/* glGetUniformSubroutineuiv: read back one subroutine uniform's index. */
+GLvoid GLAPIENTRY
+_mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location,
+                              GLuint *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetUniformSubroutineuiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   shProg = ctx->_Shader->CurrentProgram[stage]; /* acts on the bound program */
+   if (!shProg) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   /* NOTE(review): no explicit location < 0 check; confirm the field is unsigned */
+   if (location >= sh->NumSubroutineUniformRemapTable) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s", api_name);
+      return;
+   }
+
+   { /* NOTE(review): uni is dereferenced without a NULL check -- confirm */
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[location];
+      int offset = location - uni->subroutine[stage].index;
+      memcpy(params, &uni->storage[offset],
+	     sizeof(GLuint));
+   }
+}
+
+/* glGetProgramStageiv: per-stage subroutine counts and maximum name lengths. */
+GLvoid GLAPIENTRY
+_mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
+                        GLenum pname, GLint *values)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   const char *api_name = "glGetProgramStageiv";
+   struct gl_shader_program *shProg;
+   struct gl_shader *sh;
+   gl_shader_stage stage;
+
+   if (!_mesa_has_shader_subroutine(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   if (!_mesa_validate_shader_target(ctx, shadertype)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   shProg = _mesa_lookup_shader_program_err(ctx, program, api_name);
+   if (!shProg) /* presumably the lookup raised the GL error (it has api_name) */
+      return;
+
+   stage = _mesa_shader_enum_to_shader_stage(shadertype);
+   sh = shProg->_LinkedShaders[stage];
+   if (!sh) { /* no linked shader for this stage */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s", api_name);
+      return;
+   }
+
+   switch (pname) {
+   case GL_ACTIVE_SUBROUTINES:
+      values[0] = sh->NumSubroutineFunctions;
+      break;
+   case GL_ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS:
+      values[0] = sh->NumSubroutineUniformRemapTable;
+      break;
+   case GL_ACTIVE_SUBROUTINE_UNIFORMS:
+      values[0] = sh->NumSubroutineUniformTypes;
+      break;
+   case GL_ACTIVE_SUBROUTINE_MAX_LENGTH:
+   { /* longest subroutine name, counting the terminating NUL */
+      unsigned i;
+      GLint max_len = 0;
+      GLenum resource_type;
+      struct gl_program_resource *res;
+
+      resource_type = _mesa_shader_stage_to_subroutine(stage);
+      for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+         res = _mesa_program_resource_find_index(shProg, resource_type, i);
+         if (res) {
+            const GLint len = strlen(_mesa_program_resource_name(res)) + 1;
+            if (len > max_len)
+               max_len = len;
+         }
+      }
+      values[0] = max_len;
+      break;
+   }
+   case GL_ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH:
+   { /* longest uniform name: NUL included, +3 when the resource is an array */
+      unsigned i;
+      GLint max_len = 0;
+      GLenum resource_type;
+      struct gl_program_resource *res;
+
+      resource_type = _mesa_shader_stage_to_subroutine_uniform(stage);
+      for (i = 0; i < sh->NumSubroutineUniformRemapTable; i++) {
+         res = _mesa_program_resource_find_index(shProg, resource_type, i);
+         if (res) {
+            const GLint len = strlen(_mesa_program_resource_name(res)) + 1
+               + ((_mesa_program_resource_array_size(res) != 0) ? 3 : 0);
+
+            if (len > max_len)
+               max_len = len;
+         }
+      }
+      values[0] = max_len;
+      break;
+   }
+   default: /* unknown pname: raise the error and also write -1 */
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", api_name);
+      values[0] = -1;
+      break;
+   }
+}
+
+static int /* index of the first subroutine accepting `type`; 0 if none */
+find_compat_subroutine(struct gl_shader *sh, const struct glsl_type *type)
+{
+   int i, j;
+
+   for (i = 0; i < sh->NumSubroutineFunctions; i++) {
+      struct gl_subroutine_function *fn = &sh->SubroutineFunctions[i];
+      for (j = 0; j < fn->num_compat_types; j++) {
+         if (fn->types[j] == type) /* types are compared by pointer */
+            return i;
+      }
+   }
+   return 0; /* same value as a real match at index 0 */
+}
+
+static void /* give each subroutine uniform of `sh` a compatible default */
+_mesa_shader_init_subroutine_defaults(struct gl_shader *sh)
+{
+   int i, j;
+
+   for (i = 0; i < sh->NumSubroutineUniformRemapTable; i++) {
+      struct gl_uniform_storage *uni = sh->SubroutineUniformRemapTable[i];
+      int uni_count;
+      int val;
+
+      if (!uni) /* remap-table entries may be NULL; skip those */
+         continue;
+      uni_count = uni->array_elements ? uni->array_elements : 1;
+      val = find_compat_subroutine(sh, uni->type);
+
+      for (j = 0; j < uni_count; j++) /* same default in every array element */
+         memcpy(&uni->storage[j], &val, sizeof(int));
+      uni->initialized = true;
+      _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
+   }
+}
+
+void /* reset subroutine uniforms to defaults in every linked stage */
+_mesa_shader_program_init_subroutine_defaults(struct gl_shader_program *shProg)
+{
+   int i;
+
+   if (!shProg) /* a NULL program is accepted and ignored */
+      return;
+
+   for (i = 0; i < MESA_SHADER_STAGES; i++) {
+      if (!shProg->_LinkedShaders[i]) /* stage absent from this program */
+         continue;
+
+      _mesa_shader_init_subroutine_defaults(shProg->_LinkedShaders[i]);
+   }
+}
diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h
index aba6d5d8306..0a10191684f 100644
--- a/src/mesa/main/shaderapi.h
+++ b/src/mesa/main/shaderapi.h
@@ -232,7 +232,8 @@ _mesa_program_resource_index(struct gl_shader_program *shProg,
 
 extern struct gl_program_resource *
 _mesa_program_resource_find_name(struct gl_shader_program *shProg,
-                                 GLenum programInterface, const char *name);
+                                 GLenum programInterface, const char *name,
+                                 unsigned *array_index);
 
 extern struct gl_program_resource *
 _mesa_program_resource_find_index(struct gl_shader_program *shProg,
@@ -264,6 +265,51 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
                              GLsizei bufSize, GLsizei *length,
                              GLint *params);
 
+/* GL_ARB_tessellation_shader */
+extern void GLAPIENTRY
+_mesa_PatchParameteri(GLenum pname, GLint value);
+
+extern void GLAPIENTRY
+_mesa_PatchParameterfv(GLenum pname, const GLfloat *values);
+
+/* GL_ARB_shader_subroutine */
+void
+_mesa_shader_program_init_subroutine_defaults(struct gl_shader_program *shProg);
+
+extern GLint GLAPIENTRY
+_mesa_GetSubroutineUniformLocation(GLuint program, GLenum shadertype,
+                                   const GLchar *name);
+
+extern GLuint GLAPIENTRY
+_mesa_GetSubroutineIndex(GLuint program, GLenum shadertype,
+                         const GLchar *name);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformiv(GLuint program, GLenum shadertype,
+                                   GLuint index, GLenum pname, GLint *values);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineUniformName(GLuint program, GLenum shadertype,
+                                     GLuint index, GLsizei bufsize,
+                                     GLsizei *length, GLchar *name);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetActiveSubroutineName(GLuint program, GLenum shadertype,
+                              GLuint index, GLsizei bufsize,
+                              GLsizei *length, GLchar *name);
+
+extern GLvoid GLAPIENTRY
+_mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
+                            const GLuint *indices);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetUniformSubroutineuiv(GLenum shadertype, GLint location,
+                              GLuint *params);
+
+extern GLvoid GLAPIENTRY
+_mesa_GetProgramStageiv(GLuint program, GLenum shadertype,
+                        GLenum pname, GLint *values);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index 80b77275f93..a348cdb0405 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -610,7 +610,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
                         "glBindImageTextures(the internal format %s of "
                         "the level zero texture image of textures[%d]=%u "
                         "is not supported)",
-                        _mesa_lookup_enum_by_nr(tex_format),
+                        _mesa_enum_to_string(tex_format),
                         i, texture);
             continue;
          }
diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h
index 3d696a1887e..943044e37cd 100644
--- a/src/mesa/main/shaderobj.h
+++ b/src/mesa/main/shaderobj.h
@@ -111,6 +111,10 @@ _mesa_shader_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_SHADER:
       return MESA_SHADER_GEOMETRY;
+   case GL_TESS_CONTROL_SHADER:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SHADER:
+      return MESA_SHADER_TESS_EVAL;
    case GL_COMPUTE_SHADER:
       return MESA_SHADER_COMPUTE;
    default:
@@ -119,6 +123,107 @@ _mesa_shader_enum_to_shader_stage(GLenum v)
    }
 }
 
+/* Each prefix below is 8 characters ("__subu_?"); add one more underscore: 9. */
+#define MESA_SUBROUTINE_PREFIX_LEN 9
+static inline const char * /* per-stage name prefix, or NULL if unknown */
+_mesa_shader_stage_to_subroutine_prefix(gl_shader_stage stage)
+{
+  switch (stage) {
+  case MESA_SHADER_VERTEX:
+    return "__subu_v";
+  case MESA_SHADER_GEOMETRY:
+    return "__subu_g";
+  case MESA_SHADER_FRAGMENT:
+    return "__subu_f";
+  case MESA_SHADER_COMPUTE:
+    return "__subu_c";
+  case MESA_SHADER_TESS_CTRL:
+    return "__subu_t";
+  case MESA_SHADER_TESS_EVAL:
+    return "__subu_e";
+  default: /* unlike the enum helpers in this file, no fallback to vertex */
+    return NULL;
+  }
+}
+
+static inline gl_shader_stage /* GL_*_SUBROUTINE_UNIFORM enum -> stage */
+_mesa_shader_stage_from_subroutine_uniform(GLenum subuniform)
+{
+   switch (subuniform) {
+   default: /* unknown enums fall through to the vertex stage */
+   case GL_VERTEX_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_VERTEX;
+   case GL_GEOMETRY_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_GEOMETRY;
+   case GL_FRAGMENT_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_FRAGMENT;
+   case GL_COMPUTE_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_COMPUTE;
+   case GL_TESS_CONTROL_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM:
+      return MESA_SHADER_TESS_EVAL;
+   }
+}
+
+static inline gl_shader_stage /* GL_*_SUBROUTINE enum -> stage */
+_mesa_shader_stage_from_subroutine(GLenum subroutine)
+{
+   switch (subroutine) {
+   default: /* GL_VERTEX_SUBROUTINE and, so every path returns, unknown enums */
+      return MESA_SHADER_VERTEX;
+   case GL_GEOMETRY_SUBROUTINE:
+      return MESA_SHADER_GEOMETRY;
+   case GL_FRAGMENT_SUBROUTINE:
+      return MESA_SHADER_FRAGMENT;
+   case GL_COMPUTE_SUBROUTINE:
+      return MESA_SHADER_COMPUTE;
+   case GL_TESS_CONTROL_SUBROUTINE:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_SUBROUTINE:
+      return MESA_SHADER_TESS_EVAL;
+   }
+}
+
+static inline GLenum /* stage -> GL_*_SUBROUTINE program interface enum */
+_mesa_shader_stage_to_subroutine(gl_shader_stage stage)
+{
+   switch (stage) {
+   default: /* unknown stages fall through to the vertex enum */
+   case MESA_SHADER_VERTEX:
+      return GL_VERTEX_SUBROUTINE;
+   case MESA_SHADER_GEOMETRY:
+      return GL_GEOMETRY_SUBROUTINE;
+   case MESA_SHADER_FRAGMENT:
+      return GL_FRAGMENT_SUBROUTINE;
+   case MESA_SHADER_COMPUTE:
+      return GL_COMPUTE_SUBROUTINE;
+   case MESA_SHADER_TESS_CTRL:
+      return GL_TESS_CONTROL_SUBROUTINE;
+   case MESA_SHADER_TESS_EVAL:
+      return GL_TESS_EVALUATION_SUBROUTINE;
+   }
+}
+
+static inline GLenum /* stage -> GL_*_SUBROUTINE_UNIFORM interface enum */
+_mesa_shader_stage_to_subroutine_uniform(gl_shader_stage stage)
+{
+   switch (stage) {
+   default: /* unknown stages fall through to the vertex enum */
+   case MESA_SHADER_VERTEX:
+      return GL_VERTEX_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_GEOMETRY:
+      return GL_GEOMETRY_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_FRAGMENT:
+      return GL_FRAGMENT_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_COMPUTE:
+      return GL_COMPUTE_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_TESS_CTRL:
+      return GL_TESS_CONTROL_SUBROUTINE_UNIFORM;
+   case MESA_SHADER_TESS_EVAL:
+      return GL_TESS_EVALUATION_SUBROUTINE_UNIFORM;
+   }
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index bede7fe1d0e..d3b1c72b08d 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -79,8 +79,8 @@ update_program_enables(struct gl_context *ctx)
 
 
 /**
- * Update the ctx->Vertex/Geometry/FragmentProgram._Current pointers to point
- * to the current/active programs.  Then call ctx->Driver.BindProgram() to
+ * Update the ctx->*Program._Current pointers to point to the
+ * current/active programs.  Then call ctx->Driver.BindProgram() to
  * tell the driver which programs to use.
  *
  * Programs may come from 3 sources: GLSL shaders, ARB/NV_vertex/fragment
@@ -97,6 +97,10 @@ update_program(struct gl_context *ctx)
 {
    const struct gl_shader_program *vsProg =
       ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
+   const struct gl_shader_program *tcsProg =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
+   const struct gl_shader_program *tesProg =
+      ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
    const struct gl_shader_program *gsProg =
       ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
    struct gl_shader_program *fsProg =
@@ -106,6 +110,8 @@ update_program(struct gl_context *ctx)
    const struct gl_vertex_program *prevVP = ctx->VertexProgram._Current;
    const struct gl_fragment_program *prevFP = ctx->FragmentProgram._Current;
    const struct gl_geometry_program *prevGP = ctx->GeometryProgram._Current;
+   const struct gl_tess_ctrl_program *prevTCP = ctx->TessCtrlProgram._Current;
+   const struct gl_tess_eval_program *prevTEP = ctx->TessEvalProgram._Current;
    const struct gl_compute_program *prevCP = ctx->ComputeProgram._Current;
    GLbitfield new_state = 0x0;
 
@@ -175,6 +181,30 @@ update_program(struct gl_context *ctx)
       _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL);
    }
 
+   if (tesProg && tesProg->LinkStatus
+       && tesProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]) {
+      /* Use GLSL tessellation evaluation shader */
+      _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current,
+         gl_tess_eval_program(
+            tesProg->_LinkedShaders[MESA_SHADER_TESS_EVAL]->Program));
+   }
+   else {
+      /* No tessellation evaluation program */
+      _mesa_reference_tesseprog(ctx, &ctx->TessEvalProgram._Current, NULL);
+   }
+
+   if (tcsProg && tcsProg->LinkStatus
+       && tcsProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
+      /* Use GLSL tessellation control shader */
+      _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current,
+         gl_tess_ctrl_program(
+            tcsProg->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program));
+   }
+   else {
+      /* No tessellation control program */
+      _mesa_reference_tesscprog(ctx, &ctx->TessCtrlProgram._Current, NULL);
+   }
+
    /* Examine vertex program after fragment program as
     * _mesa_get_fixed_func_vertex_program() needs to know active
     * fragprog inputs.
@@ -230,6 +260,22 @@ update_program(struct gl_context *ctx)
       }
    }
 
+   if (ctx->TessEvalProgram._Current != prevTEP) {
+      new_state |= _NEW_PROGRAM;
+      if (ctx->Driver.BindProgram) {
+         ctx->Driver.BindProgram(ctx, GL_TESS_EVALUATION_PROGRAM_NV,
+                            (struct gl_program *) ctx->TessEvalProgram._Current);
+      }
+   }
+
+   if (ctx->TessCtrlProgram._Current != prevTCP) {
+      new_state |= _NEW_PROGRAM;
+      if (ctx->Driver.BindProgram) {
+         ctx->Driver.BindProgram(ctx, GL_TESS_CONTROL_PROGRAM_NV,
+                            (struct gl_program *) ctx->TessCtrlProgram._Current);
+      }
+   }
+
    if (ctx->VertexProgram._Current != prevVP) {
       new_state |= _NEW_PROGRAM;
       if (ctx->Driver.BindProgram) {
@@ -266,8 +312,8 @@ update_program_constants(struct gl_context *ctx)
       }
    }
 
-   /* Don't handle geometry shaders here. They don't use any state
-    * constants.
+   /* Don't handle tessellation and geometry shaders here. They don't use
+    * any state constants.
     */
 
    if (ctx->VertexProgram._Current) {
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 800720b798e..af89d2c1cfb 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -563,6 +563,8 @@ const struct function common_desktop_functions_possible[] = {
 
    /* GL 4.0 */
    { "glMinSampleShading", 40, -1 },
+   { "glPatchParameteri", 40, -1 },
+   { "glPatchParameterfv", 40, -1 },
    { "glBlendEquationi", 40, -1 },
    { "glBlendEquationSeparatei", 40, -1 },
    { "glBlendFunci", 40, -1 },
@@ -930,6 +932,11 @@ const struct function common_desktop_functions_possible[] = {
 
    /* GL_EXT_polygon_offset_clamp */
    { "glPolygonOffsetClampEXT", 11, -1 },
+
+   /* GL_ARB_get_texture_sub_image */
+   { "glGetTextureSubImage", 20, -1 },
+   { "glGetCompressedTextureSubImage", 20, -1 },
+
    { NULL, 0, -1 }
 };
 
@@ -1424,6 +1431,16 @@ const struct function gl_core_functions_possible[] = {
    /* GL 3.2 */
    { "glFramebufferTexture", 32, -1 },
 
+   /* GL 4.0 */
+   { "glGetSubroutineUniformLocation", 40, -1 },
+   { "glGetSubroutineIndex", 40, -1 },
+   { "glGetActiveSubroutineUniformiv", 40, -1 },
+   { "glGetActiveSubroutineUniformName", 40, -1 },
+   { "glGetActiveSubroutineName", 40, -1 },
+   { "glUniformSubroutinesuiv", 40, -1 },
+   { "glGetUniformSubroutineuiv", 40, -1 },
+   { "glGetProgramStageiv", 40, -1 },
+
    /* GL 4.3 */
    { "glIsRenderbuffer", 43, -1 },
    { "glBindRenderbuffer", 43, -1 },
@@ -1562,16 +1579,6 @@ const struct function gl_core_functions_possible[] = {
    { "glUniformMatrix4x2dv", 40, -1 },
    { "glUniformMatrix4x3dv", 40, -1 },
    { "glGetUniformdv", 43, -1 },
-// { "glGetSubroutineUniformLocation", 43, -1 },        // XXX: Add to xml
-// { "glGetSubroutineIndex", 43, -1 },                  // XXX: Add to xml
-// { "glGetActiveSubroutineUniformiv", 43, -1 },        // XXX: Add to xml
-// { "glGetActiveSubroutineUniformName", 43, -1 },      // XXX: Add to xml
-// { "glGetActiveSubroutineName", 43, -1 },             // XXX: Add to xml
-// { "glUniformSubroutinesuiv", 43, -1 },               // XXX: Add to xml
-// { "glGetUniformSubroutineuiv", 43, -1 },             // XXX: Add to xml
-// { "glGetProgramStageiv", 43, -1 },                   // XXX: Add to xml
-// { "glPatchParameteri", 43, -1 },                     // XXX: Add to xml
-// { "glPatchParameterfv", 43, -1 },                    // XXX: Add to xml
 
    { "glBindTransformFeedback", 43, -1 },
    { "glDeleteTransformFeedbacks", 43, -1 },
diff --git a/src/mesa/main/tests/enum_strings.cpp b/src/mesa/main/tests/enum_strings.cpp
index dc5fe751a86..8218cc9a685 100644
--- a/src/mesa/main/tests/enum_strings.cpp
+++ b/src/mesa/main/tests/enum_strings.cpp
@@ -39,13 +39,13 @@ TEST(EnumStrings, LookUpByNumber)
 {
    for (unsigned i = 0; everything[i].name != NULL; i++) {
       EXPECT_STREQ(everything[i].name,
-		   _mesa_lookup_enum_by_nr(everything[i].value));
+		   _mesa_enum_to_string(everything[i].value));
    }
 }
 
 TEST(EnumStrings, LookUpUnknownNumber)
 {
-   EXPECT_STRCASEEQ("0xEEEE", _mesa_lookup_enum_by_nr(0xEEEE));
+   EXPECT_STRCASEEQ("0xEEEE", _mesa_enum_to_string(0xEEEE));
 }
 
 /* Please type the name and the value.  This makes it easier to detect
@@ -1731,6 +1731,10 @@ const struct enum_info everything[] = {
    { 0x8DDF, "GL_MAX_GEOMETRY_UNIFORM_COMPONENTS" },
    { 0x8DE0, "GL_MAX_GEOMETRY_OUTPUT_VERTICES" },
    { 0x8DE1, "GL_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS" },
+   { 0x8DE5, "GL_ACTIVE_SUBROUTINES" },
+   { 0x8DE6, "GL_ACTIVE_SUBROUTINE_UNIFORMS" },
+   { 0x8DE7, "GL_MAX_SUBROUTINES" },
+   { 0x8DE8, "GL_MAX_SUBROUTINE_UNIFORM_LOCATIONS" },
    { 0x8DF0, "GL_LOW_FLOAT" },
    { 0x8DF1, "GL_MEDIUM_FLOAT" },
    { 0x8DF2, "GL_HIGH_FLOAT" },
@@ -1759,6 +1763,11 @@ const struct enum_info everything[] = {
    { 0x8E44, "GL_TEXTURE_SWIZZLE_B" },
    { 0x8E45, "GL_TEXTURE_SWIZZLE_A" },
    { 0x8E46, "GL_TEXTURE_SWIZZLE_RGBA" },
+   { 0x8E47, "GL_ACTIVE_SUBROUTINE_UNIFORM_LOCATIONS" },
+   { 0x8E48, "GL_ACTIVE_SUBROUTINE_MAX_LENGTH" },
+   { 0x8E49, "GL_ACTIVE_SUBROUTINE_UNIFORM_MAX_LENGTH" },
+   { 0x8E4A, "GL_NUM_COMPATIBLE_SUBROUTINES" },
+   { 0x8E4B, "GL_COMPATIBLE_SUBROUTINES" },
    { 0x8E4C, "GL_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION" },
    { 0x8E4D, "GL_FIRST_VERTEX_CONVENTION" },
    { 0x8E4E, "GL_LAST_VERTEX_CONVENTION" },
diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c
index 3edafc0f776..091922161c5 100644
--- a/src/mesa/main/texenv.c
+++ b/src/mesa/main/texenv.c
@@ -42,7 +42,7 @@
 
 
 #define TE_ERROR(errCode, msg, value)				\
-   _mesa_error(ctx, errCode, msg, _mesa_lookup_enum_by_nr(value));
+   _mesa_error(ctx, errCode, msg, _mesa_enum_to_string(value));
 
 
 /** Set texture env mode */
@@ -482,16 +482,16 @@ _mesa_TexEnvfv( GLenum target, GLenum pname, const GLfloat *param )
    }
    else {
       _mesa_error(ctx, GL_INVALID_ENUM, "glTexEnv(target=%s)",
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
    if (MESA_VERBOSE&(VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexEnv %s %s %.1f(%s) ...\n",
-                  _mesa_lookup_enum_by_nr(target),
-                  _mesa_lookup_enum_by_nr(pname),
+                  _mesa_enum_to_string(target),
+                  _mesa_enum_to_string(pname),
                   *param,
-                  _mesa_lookup_enum_by_nr((GLenum) iparam0));
+                  _mesa_enum_to_string((GLenum) iparam0));
 
    /* Tell device driver about the new texture environment */
    if (ctx->Driver.TexEnv) {
diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c
index 3c4baca7026..f4d17e1bdb5 100644
--- a/src/mesa/main/texformat.c
+++ b/src/mesa/main/texformat.c
@@ -847,7 +847,7 @@ _mesa_choose_tex_format(struct gl_context *ctx, GLenum target,
    }
 
    _mesa_problem(ctx, "unexpected format %s in _mesa_choose_tex_format()",
-                 _mesa_lookup_enum_by_nr(internalFormat));
+                 _mesa_enum_to_string(internalFormat));
    return MESA_FORMAT_NONE;
 }
 
diff --git a/src/mesa/main/texgen.c b/src/mesa/main/texgen.c
index 41e428b69e7..24ba295746a 100644
--- a/src/mesa/main/texgen.c
+++ b/src/mesa/main/texgen.c
@@ -76,10 +76,10 @@ _mesa_TexGenfv( GLenum coord, GLenum pname, const GLfloat *params )
 
    if (MESA_VERBOSE&(VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexGen %s %s %.1f(%s)...\n",
-                  _mesa_lookup_enum_by_nr(coord),
-                  _mesa_lookup_enum_by_nr(pname),
+                  _mesa_enum_to_string(coord),
+                  _mesa_enum_to_string(pname),
                   *params,
-		  _mesa_lookup_enum_by_nr((GLenum) (GLint) *params));
+		  _mesa_enum_to_string((GLenum) (GLint) *params));
 
    if (ctx->Texture.CurrentUnit >= ctx->Const.MaxTextureCoordUnits) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glTexGen(current unit)");
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 92b4d6795c6..c0ccce3d50e 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -75,12 +75,11 @@ type_needs_clamping(GLenum type)
  */
 static void
 get_tex_depth(struct gl_context *ctx, GLuint dimensions,
+              GLint xoffset, GLint yoffset, GLint zoffset,
+              GLsizei width, GLsizei height, GLint depth,
               GLenum format, GLenum type, GLvoid *pixels,
               struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   GLint height = texImage->Height;
-   GLint depth = texImage->Depth;
    GLint img, row;
    GLfloat *depthRow = malloc(width * sizeof(GLfloat));
 
@@ -94,14 +93,15 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
       height = 1;
    }
 
+   assert(zoffset + depth <= texImage->Depth);
    for (img = 0; img < depth; img++) {
       GLubyte *srcMap;
       GLint srcRowStride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
-                                  &srcMap, &srcRowStride);
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT, &srcMap, &srcRowStride);
 
       if (srcMap) {
          for (row = 0; row < height; row++) {
@@ -113,7 +113,7 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
             _mesa_pack_depth_span(ctx, width, dest, type, depthRow, &ctx->Pack);
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -130,26 +130,26 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions,
+                      GLint xoffset, GLint yoffset, GLint zoffset,
+                      GLsizei width, GLsizei height, GLint depth,
                       GLenum format, GLenum type, GLvoid *pixels,
                       struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Depth;
    GLint img, row;
 
    assert(format == GL_DEPTH_STENCIL);
    assert(type == GL_UNSIGNED_INT_24_8 ||
           type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV);
 
+   assert(zoffset + depth <= texImage->Depth);
    for (img = 0; img < depth; img++) {
       GLubyte *srcMap;
       GLint rowstride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
-                                  &srcMap, &rowstride);
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT, &srcMap, &rowstride);
 
       if (srcMap) {
          for (row = 0; row < height; row++) {
@@ -166,7 +166,7 @@ get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions,
             }
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -180,12 +180,11 @@ get_tex_depth_stencil(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
+                GLint xoffset, GLint yoffset, GLint zoffset,
+                GLsizei width, GLsizei height, GLint depth,
                 GLenum format, GLenum type, GLvoid *pixels,
                 struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Depth;
    GLint img, row;
 
    assert(format == GL_STENCIL_INDEX);
@@ -195,8 +194,9 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
       GLint rowstride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT,
                                   &srcMap, &rowstride);
 
       if (srcMap) {
@@ -211,7 +211,7 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
                                            dest);
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -226,22 +226,22 @@ get_tex_stencil(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions,
+              GLint xoffset, GLint yoffset, GLint zoffset,
+              GLsizei width, GLsizei height, GLint depth,
               GLenum format, GLenum type, GLvoid *pixels,
               struct gl_texture_image *texImage)
 {
-   const GLint width = texImage->Width;
-   const GLint height = texImage->Height;
-   const GLint depth = texImage->Depth;
    GLint img, row;
 
+   assert(zoffset + depth <= texImage->Depth);
    for (img = 0; img < depth; img++) {
       GLubyte *srcMap;
       GLint rowstride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
-                                  &srcMap, &rowstride);
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT, &srcMap, &rowstride);
 
       if (srcMap) {
          for (row = 0; row < height; row++) {
@@ -264,7 +264,7 @@ get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions,
             }
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -279,6 +279,8 @@ get_tex_ycbcr(struct gl_context *ctx, GLuint dimensions,
  */
 static void
 get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
                         GLenum format, GLenum type, GLvoid *pixels,
                         struct gl_texture_image *texImage,
                         GLbitfield transferOps)
@@ -287,9 +289,6 @@ get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions,
    const mesa_format texFormat =
       _mesa_get_srgb_format_linear(texImage->TexFormat);
    const GLenum baseFormat = _mesa_get_format_base_format(texFormat);
-   const GLuint width = texImage->Width;
-   const GLuint height = texImage->Height;
-   const GLuint depth = texImage->Depth;
    GLfloat *tempImage, *tempSlice;
    GLuint slice;
    int srcStride, dstStride;
@@ -312,15 +311,15 @@ get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions,
 
       tempSlice = tempImage + slice * 4 * width * height;
 
-      ctx->Driver.MapTextureImage(ctx, texImage, slice,
-                                  0, 0, width, height,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + slice,
+                                  xoffset, yoffset, width, height,
                                   GL_MAP_READ_BIT,
                                   &srcMap, &srcRowStride);
       if (srcMap) {
          _mesa_decompress_image(texFormat, width, height,
                                 srcMap, srcRowStride, tempSlice);
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, slice);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + slice);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -409,6 +408,8 @@ _mesa_base_pack_format(GLenum format)
  */
 static void
 get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
+                          GLint xoffset, GLint yoffset, GLint zoffset,
+                          GLsizei width, GLsizei height, GLint depth,
                           GLenum format, GLenum type, GLvoid *pixels,
                           struct gl_texture_image *texImage,
                           GLbitfield transferOps)
@@ -416,9 +417,6 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
    /* don't want to apply sRGB -> RGB conversion here so override the format */
    const mesa_format texFormat =
       _mesa_get_srgb_format_linear(texImage->TexFormat);
-   const GLuint width = texImage->Width;
-   GLuint height = texImage->Height;
-   GLuint depth = texImage->Depth;
    GLuint img;
    GLboolean dst_is_integer;
    uint32_t dst_format;
@@ -430,6 +428,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
    if (texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY) {
       depth = height;
       height = 1;
+      zoffset = yoffset;
+      yoffset = 0;
    }
 
    /* Depending on the base format involved we may need to apply a rebase
@@ -449,7 +449,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
       rebaseSwizzle[1] = MESA_FORMAT_SWIZZLE_ZERO;
       rebaseSwizzle[2] = MESA_FORMAT_SWIZZLE_ZERO;
       rebaseSwizzle[3] = MESA_FORMAT_SWIZZLE_W;
-    } else if (texImage->_BaseFormat != _mesa_get_format_base_format(texFormat)) {
+    } else if (texImage->_BaseFormat !=
+               _mesa_get_format_base_format(texFormat)) {
       needsRebase =
          _mesa_compute_rgba2base2rgba_component_mapping(texImage->_BaseFormat,
                                                         rebaseSwizzle);
@@ -480,8 +481,9 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
       uint32_t src_format;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, img,
-                                  0, 0, width, height, GL_MAP_READ_BIT,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + img,
+                                  xoffset, yoffset, width, height,
+                                  GL_MAP_READ_BIT,
                                   &srcMap, &rowstride);
       if (!srcMap) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -530,8 +532,8 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
          /* If we had to rebase, we have already handled that */
          needsRebase = false;
 
-         /* If we were lucky and our RGBA conversion matches the dst format, then
-          * we are done.
+         /* If we were lucky and our RGBA conversion matches the dst format,
+          * then we are done.
           */
          if (!need_convert)
             goto do_swap;
@@ -568,7 +570,7 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
       }
 
       /* Unmap the src texture buffer */
-      ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+      ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + img);
    }
 
 done:
@@ -583,6 +585,8 @@ done:
  */
 static void
 get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
+             GLint xoffset, GLint yoffset, GLint zoffset,
+             GLsizei width, GLsizei height, GLint depth,
              GLenum format, GLenum type, GLvoid *pixels,
              struct gl_texture_image *texImage)
 {
@@ -604,11 +608,17 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
    }
 
    if (_mesa_is_format_compressed(texImage->TexFormat)) {
-      get_tex_rgba_compressed(ctx, dimensions, format, type,
+      get_tex_rgba_compressed(ctx, dimensions,
+                              xoffset, yoffset, zoffset,
+                              width, height, depth,
+                              format, type,
                               pixels, texImage, transferOps);
    }
    else {
-      get_tex_rgba_uncompressed(ctx, dimensions, format, type,
+      get_tex_rgba_uncompressed(ctx, dimensions,
+                                xoffset, yoffset, zoffset,
+                                width, height, depth,
+                                format, type,
                                 pixels, texImage, transferOps);
    }
 }
@@ -619,8 +629,10 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
  * \return GL_TRUE if done, GL_FALSE otherwise
  */
 static GLboolean
-get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
-               GLvoid *pixels,
+get_tex_memcpy(struct gl_context *ctx,
+               GLint xoffset, GLint yoffset, GLint zoffset,
+               GLsizei width, GLsizei height, GLint depth,
+               GLenum format, GLenum type, GLvoid *pixels,
                struct gl_texture_image *texImage)
 {
    const GLenum target = texImage->TexObject->Target;
@@ -642,20 +654,25 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
                                                      ctx->Pack.SwapBytes);
    }
 
+   if (depth > 1) {
+      /* the memcpy fast path only handles a single slice */
+      memCopy = FALSE;
+   }
+
    if (memCopy) {
       const GLuint bpp = _mesa_get_format_bytes(texImage->TexFormat);
-      const GLint bytesPerRow = texImage->Width * bpp;
+      const GLint bytesPerRow = width * bpp;
       GLubyte *dst =
-         _mesa_image_address2d(&ctx->Pack, pixels, texImage->Width,
-                               texImage->Height, format, type, 0, 0);
+         _mesa_image_address2d(&ctx->Pack, pixels, width, height,
+                               format, type, 0, 0);
       const GLint dstRowStride =
-         _mesa_image_row_stride(&ctx->Pack, texImage->Width, format, type);
+         _mesa_image_row_stride(&ctx->Pack, width, format, type);
       GLubyte *src;
       GLint srcRowStride;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, 0,
-                                  0, 0, texImage->Width, texImage->Height,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset,
+                                  xoffset, yoffset, width, height,
                                   GL_MAP_READ_BIT, &src, &srcRowStride);
 
       if (src) {
@@ -664,7 +681,7 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
          }
          else {
             GLuint row;
-            for (row = 0; row < texImage->Height; row++) {
+            for (row = 0; row < height; row++) {
                memcpy(dst, src, bytesPerRow);
                dst += dstRowStride;
                src += srcRowStride;
@@ -672,7 +689,7 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
          }
 
          /* unmap src texture buffer */
-         ctx->Driver.UnmapTextureImage(ctx, texImage, 0);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
@@ -684,15 +701,17 @@ get_tex_memcpy(struct gl_context *ctx, GLenum format, GLenum type,
 
 
 /**
- * This is the software fallback for Driver.GetTexImage().
+ * This is the software fallback for Driver.GetTexSubImage().
  * All error checking will have been done before this routine is called.
  * We'll call ctx->Driver.MapTextureImage() to access the data, then
  * unmap with ctx->Driver.UnmapTextureImage().
  */
 void
-_mesa_GetTexImage_sw(struct gl_context *ctx,
-                     GLenum format, GLenum type, GLvoid *pixels,
-                     struct gl_texture_image *texImage)
+_mesa_GetTexSubImage_sw(struct gl_context *ctx,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
+                        GLenum format, GLenum type, GLvoid *pixels,
+                        struct gl_texture_image *texImage)
 {
    const GLuint dimensions =
       _mesa_get_texture_dimensions(texImage->TexObject->Target);
@@ -720,23 +739,30 @@ _mesa_GetTexImage_sw(struct gl_context *ctx,
       pixels = ADD_POINTERS(buf, pixels);
    }
 
-   if (get_tex_memcpy(ctx, format, type, pixels, texImage)) {
+   if (get_tex_memcpy(ctx, xoffset, yoffset, zoffset, width, height, depth,
+                      format, type, pixels, texImage)) {
       /* all done */
    }
    else if (format == GL_DEPTH_COMPONENT) {
-      get_tex_depth(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_depth(ctx, dimensions, xoffset, yoffset, zoffset,
+                    width, height, depth, format, type, pixels, texImage);
    }
    else if (format == GL_DEPTH_STENCIL_EXT) {
-      get_tex_depth_stencil(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_depth_stencil(ctx, dimensions, xoffset, yoffset, zoffset,
+                            width, height, depth, format, type, pixels,
+                            texImage);
    }
    else if (format == GL_STENCIL_INDEX) {
-      get_tex_stencil(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_stencil(ctx, dimensions, xoffset, yoffset, zoffset,
+                      width, height, depth, format, type, pixels, texImage);
    }
    else if (format == GL_YCBCR_MESA) {
-      get_tex_ycbcr(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_ycbcr(ctx, dimensions, xoffset, yoffset, zoffset,
+                    width, height, depth, format, type, pixels, texImage);
    }
    else {
-      get_tex_rgba(ctx, dimensions, format, type, pixels, texImage);
+      get_tex_rgba(ctx, dimensions, xoffset, yoffset, zoffset,
+                   width, height, depth, format, type, pixels, texImage);
    }
 
    if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
@@ -747,13 +773,16 @@ _mesa_GetTexImage_sw(struct gl_context *ctx,
 
 
 /**
- * This is the software fallback for Driver.GetCompressedTexImage().
+ * This is the software fallback for Driver.GetCompressedTexSubImage().
  * All error checking will have been done before this routine is called.
  */
 void
-_mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
-                               struct gl_texture_image *texImage,
-                               GLvoid *img)
+_mesa_GetCompressedTexSubImage_sw(struct gl_context *ctx,
+                                  struct gl_texture_image *texImage,
+                                  GLint xoffset, GLint yoffset,
+                                  GLint zoffset, GLsizei width,
+                                  GLint height, GLint depth,
+                                  GLvoid *img)
 {
    const GLuint dimensions =
       _mesa_get_texture_dimensions(texImage->TexObject->Target);
@@ -762,10 +791,8 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
    GLubyte *dest;
 
    _mesa_compute_compressed_pixelstore(dimensions, texImage->TexFormat,
-                                       texImage->Width, texImage->Height,
-                                       texImage->Depth,
-                                       &ctx->Pack,
-                                       &store);
+                                       width, height, depth,
+                                       &ctx->Pack, &store);
 
    if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
       /* pack texture image into a PBO */
@@ -791,8 +818,8 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
       GLubyte *src;
 
       /* map src texture buffer */
-      ctx->Driver.MapTextureImage(ctx, texImage, slice,
-                                  0, 0, texImage->Width, texImage->Height,
+      ctx->Driver.MapTextureImage(ctx, texImage, zoffset + slice,
+                                  xoffset, yoffset, width, height,
                                   GL_MAP_READ_BIT, &src, &srcRowStride);
 
       if (src) {
@@ -803,10 +830,11 @@ _mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
             src += srcRowStride;
          }
 
-         ctx->Driver.UnmapTextureImage(ctx, texImage, slice);
+         ctx->Driver.UnmapTextureImage(ctx, texImage, zoffset + slice);
 
          /* Advance to next slice */
-         dest += store.TotalBytesPerRow * (store.TotalRowsPerSlice - store.CopyRowsPerSlice);
+         dest += store.TotalBytesPerRow * (store.TotalRowsPerSlice -
+                                           store.CopyRowsPerSlice);
 
       } else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetCompresssedTexImage");
@@ -863,28 +891,298 @@ legal_getteximage_target(struct gl_context *ctx, GLenum target, bool dsa)
 
 
 /**
- * Do error checking for a glGetTex(ture)Image() call.
- * \return GL_TRUE if any error, GL_FALSE if no errors.
+ * Wrapper for _mesa_select_tex_image() that also accepts the whole-cube
+ * target GL_TEXTURE_CUBE_MAP, in which case zoffset (0..5) selects the cube
+ * face.  This can happen for glGetTextureImage and glGetTextureSubImage (DSA
+ * functions).  level and zoffset are only asserted here, not error-checked.
  */
-static GLboolean
-getteximage_error_check(struct gl_context *ctx,
-                        struct gl_texture_image *texImage,
-                        GLenum target, GLint level,
-                        GLenum format, GLenum type, GLsizei clientMemSize,
-                        GLvoid *pixels, bool dsa)
+static struct gl_texture_image *
+select_tex_image(const struct gl_texture_object *texObj, GLenum target,
+                 GLint level, GLint zoffset)
 {
-   const GLint maxLevels = _mesa_max_texture_levels(ctx, target);
-   const GLuint dimensions = (target == GL_TEXTURE_3D) ? 3 : 2;
-   GLenum baseFormat;
-   const char *suffix = dsa ? "ture" : "";
-
-   assert(texImage);
-   assert(maxLevels != 0);
-   if (level < 0 || level >= maxLevels) {
-      _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glGetTex%sImage(level out of range)", suffix);
-      return GL_TRUE;
+   assert(level >= 0);
+   assert(level < MAX_TEXTURE_LEVELS);
+   if (target == GL_TEXTURE_CUBE_MAP) {
+      assert(zoffset >= 0);
+      assert(zoffset < 6);
+      target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset;
    }
+   return _mesa_select_tex_image(texObj, target, level);
+}
+
+
+/**
+ * Error-check the offset and size arguments shared by the
+ * glGet[Compressed]Tex*Image() and glGet[Compressed]TextureSubImage()
+ * paths.  Also checks if the specified texture image is missing.
+ * \return true on error or if the region is empty (no-op), else false.
+ */
+static bool
+dimensions_error_check(struct gl_context *ctx,
+                       struct gl_texture_object *texObj,
+                       GLenum target, GLint level,
+                       GLint xoffset, GLint yoffset, GLint zoffset,
+                       GLsizei width, GLsizei height, GLsizei depth,
+                       const char *caller)
+{
+   const struct gl_texture_image *texImage;
+   int i;
+
+   if (xoffset < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset = %d)", caller, xoffset);
+      return true;
+   }
+
+   if (yoffset < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(yoffset = %d)", caller, yoffset);
+      return true;
+   }
+
+   if (zoffset < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(zoffset = %d)", caller, zoffset);
+      return true;
+   }
+
+   if (width < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(width = %d)", caller, width);
+      return true;
+   }
+
+   if (height < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(height = %d)", caller, height);
+      return true;
+   }
+
+   if (depth < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(depth = %d)", caller, depth);
+      return true;
+   }
+
+   /* do special per-target checks */
+   switch (target) {
+   case GL_TEXTURE_1D:
+      if (yoffset != 0) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(1D, yoffset = %d)", caller, yoffset);
+         return true;
+      }
+      if (height > 1) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(1D, height = %d)", caller, height);
+         return true;
+      }
+      /* fall-through */
+   case GL_TEXTURE_1D_ARRAY:
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+      if (zoffset != 0) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(zoffset = %d)", caller, zoffset);
+         return true;
+      }
+      if (depth > 1) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(depth = %d)", caller, depth);
+         return true;
+      }
+      break;
+   case GL_TEXTURE_CUBE_MAP:
+      /* Non-array cube maps are special because we have a gl_texture_image
+       * per face.
+       */
+      if (zoffset + depth > 6) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(zoffset + depth = %d)", caller, zoffset + depth);
+         return true;
+      }
+      /* check that every face in the requested range exists */
+      for (i = 0; i < depth; i++) {
+         GLenum face = GL_TEXTURE_CUBE_MAP_POSITIVE_X + zoffset + i;
+         if (!_mesa_select_tex_image(texObj, face, level)) {
+            /* non-existent face */
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(missing cube face)", caller);
+            return true;
+         }
+      }
+      break;
+   default:
+      ; /* nothing */
+   }
+
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   if (!texImage) {
+      /* missing texture image */
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(missing image)", caller);
+      return true;
+   }
+
+   if (xoffset + width > texImage->Width) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(xoffset %d + width %d > %u)",
+                  caller, xoffset, width, texImage->Width);
+      return true;
+   }
+
+   if (yoffset + height > texImage->Height) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(yoffset %d + height %d > %u)",
+                  caller, yoffset, height, texImage->Height);
+      return true;
+   }
+
+   if (target != GL_TEXTURE_CUBE_MAP) {
+      /* Cube map error checking was done above */
+      if (zoffset + depth > texImage->Depth) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(zoffset %d + depth %d > %u)",
+                     caller, zoffset, depth, texImage->Depth);
+         return true;
+      }
+   }
+
+   /* Extra checks for compressed textures */
+   {
+      GLuint bw, bh;
+      _mesa_get_format_block_size(texImage->TexFormat, &bw, &bh);
+      if (bw > 1 || bh > 1) {
+         /* offset must be multiple of block size */
+         if (xoffset % bw != 0) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(xoffset = %d)", caller, xoffset);
+            return true;
+         }
+         if (target != GL_TEXTURE_1D && target != GL_TEXTURE_1D_ARRAY) {
+            if (yoffset % bh != 0) {
+               _mesa_error(ctx, GL_INVALID_VALUE,
+                           "%s(yoffset = %d)", caller, yoffset);
+               return true;
+            }
+         }
+
+         /* The size must be a multiple of bw x bh, or we must be using an
+          * offset+size that exactly hits the edge of the image.
+          */
+         if ((width % bw != 0) &&
+             (xoffset + width != (GLint) texImage->Width)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(width = %d)", caller, width);
+            return true;
+         }
+
+         if ((height % bh != 0) &&
+             (yoffset + height != (GLint) texImage->Height)) {
+            _mesa_error(ctx, GL_INVALID_VALUE,
+                        "%s(height = %d)", caller, height);
+            return true;
+         }
+      }
+   }
+
+   if (width == 0 || height == 0 || depth == 0) {
+      /* Not an error, but nothing to do.  Return 'true' so that the
+       * caller simply returns.
+       */
+      return true;
+   }
+
+   return false;
+}
+
+
+/**
+ * Do PBO/client-memory bounds checking for getting uncompressed images.
+ * \return true on error, or if there is no PBO and pixels is NULL (no-op)
+ */
+static bool
+pbo_error_check(struct gl_context *ctx, GLenum target,
+                GLsizei width, GLsizei height, GLsizei depth,
+                GLenum format, GLenum type, GLsizei clientMemSize,
+                GLvoid *pixels,
+                const char *caller)
+{
+   const GLuint dimensions = (target == GL_TEXTURE_3D) ? 3 : 2;
+
+   if (!_mesa_validate_pbo_access(dimensions, &ctx->Pack, width, height, depth,
+                                  format, type, clientMemSize, pixels)) {
+      if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(out of bounds PBO access)", caller);
+      } else {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(out of bounds access: bufSize (%d) is too small)",
+                     caller, clientMemSize);
+      }
+      return true;
+   }
+
+   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
+      /* PBO should not be mapped */
+      if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s(PBO is mapped)", caller);
+         return true;
+      }
+   }
+
+   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
+      /* not an error, do nothing */
+      return true;
+   }
+
+   return false;
+}
+
+
+/**
+ * Do error checking for all (non-compressed) get-texture-image functions.
+ * \return true if any error, false if no errors.
+ */
+static bool
+getteximage_error_check(struct gl_context *ctx,
+                        struct gl_texture_object *texObj,
+                        GLenum target, GLint level,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLsizei depth,
+                        GLenum format, GLenum type, GLsizei bufSize,
+                        GLvoid *pixels, const char *caller)
+{
+   struct gl_texture_image *texImage;
+   GLenum baseFormat, err;
+   GLint maxLevels;
+
+   assert(texObj);
+
+   if (texObj->Target == 0) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller);
+      return true;
+   }
+
+   maxLevels = _mesa_max_texture_levels(ctx, target);
+   if (level < 0 || level >= maxLevels) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(level = %d)", caller, level);
+      return true;
+   }
+
+   err = _mesa_error_check_format_and_type(ctx, format, type);
+   if (err != GL_NO_ERROR) {
+      _mesa_error(ctx, err, "%s(format/type)", caller);
+      return true;
+   }
+
+   if (dimensions_error_check(ctx, texObj, target, level,
+                              xoffset, yoffset, zoffset,
+                              width, height, depth, caller)) {
+      return true;
+   }
+
+   if (pbo_error_check(ctx, target, width, height, depth,
+                       format, type, bufSize, pixels, caller)) {
+      return true;
+   }
+
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);
 
    /*
     * Format and type checking has been moved up to GetnTexImage and
@@ -899,494 +1197,579 @@ getteximage_error_check(struct gl_context *ctx,
    if (_mesa_is_color_format(format)
        && !_mesa_is_color_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
    else if (_mesa_is_depth_format(format)
             && !_mesa_is_depth_format(baseFormat)
             && !_mesa_is_depthstencil_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
    else if (_mesa_is_stencil_format(format)
             && !ctx->Extensions.ARB_texture_stencil8) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetTex%sImage(format=GL_STENCIL_INDEX)", suffix);
-      return GL_TRUE;
+                  "%s(format=GL_STENCIL_INDEX)", caller);
+      return true;
    }
    else if (_mesa_is_ycbcr_format(format)
             && !_mesa_is_ycbcr_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
    else if (_mesa_is_depthstencil_format(format)
             && !_mesa_is_depthstencil_format(baseFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
-   else if (!_mesa_is_stencil_format(format) && _mesa_is_enum_format_integer(format) !=
+   else if (!_mesa_is_stencil_format(format) &&
+            _mesa_is_enum_format_integer(format) !=
             _mesa_is_format_integer(texImage->TexFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetTex%sImage(format mismatch)", suffix);
-      return GL_TRUE;
+                  "%s(format mismatch)", caller);
+      return true;
    }
 
-   if (!_mesa_validate_pbo_access(dimensions, &ctx->Pack, texImage->Width,
-                                  texImage->Height, texImage->Depth,
-                                  format, type, clientMemSize, pixels)) {
-      if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetTex%sImage(out of bounds PBO access)", suffix);
-      } else {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(out of bounds access:"
-                     " bufSize (%d) is too small)",
-                     dsa ? "glGetTextureImage" : "glGetnTexImageARB",
-                     clientMemSize);
-      }
-      return GL_TRUE;
-   }
-
-   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-      /* PBO should not be mapped */
-      if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetTex%sImage(PBO is mapped)", suffix);
-         return GL_TRUE;
-      }
-   }
-
-   return GL_FALSE;
+   return false;
 }
 
 
 /**
- * This is the implementation for glGetnTexImageARB, glGetTextureImage,
- * and glGetTexImage.
- *
- * Requires caller to pass in texImage object because _mesa_GetTextureImage
- * must handle the GL_TEXTURE_CUBE_MAP target.
- *
- * \param target texture target.
+ * Return the width, height and depth of a texture image (depth is 6 for
+ * GL_TEXTURE_CUBE_MAP), or all zeros if level is out of range or the image
+ * is missing.  Must tolerate bad values: it runs before full error checking.
+ */
+static void
+get_texture_image_dims(const struct gl_texture_object *texObj,
+                       GLenum target, GLint level,
+                       GLsizei *width, GLsizei *height, GLsizei *depth)
+{
+   const struct gl_texture_image *texImage = NULL;
+
+   if (level >= 0 && level < MAX_TEXTURE_LEVELS) {
+      texImage = _mesa_select_tex_image(texObj, target, level);
+   }
+
+   if (texImage) {
+      *width = texImage->Width;
+      *height = texImage->Height;
+      if (target == GL_TEXTURE_CUBE_MAP) {
+         *depth = 6;
+      }
+      else {
+         *depth = texImage->Depth;
+      }
+   }
+   else {
+      *width = *height = *depth = 0;
+   }
+}
+
+
+/**
+ * Common code for all (uncompressed) get-texture-image functions.
+ * \param texObj  the texture object (should not be null)
+ * \param target  texture target (texObj->Target for the DSA entry points)
  * \param level image level.
  * \param format pixel data format for returned image.
  * \param type pixel data type for returned image.
  * \param bufSize size of the pixels data buffer.
  * \param pixels returned pixel data.
- * \param dsa True when the caller is an ARB_direct_state_access function,
- *            false otherwise
+ * \param caller  name of calling function
  */
-void
-_mesa_get_texture_image(struct gl_context *ctx,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage, GLenum target,
-                        GLint level, GLenum format, GLenum type,
-                        GLsizei bufSize, GLvoid *pixels, bool dsa)
+static void
+get_texture_image(struct gl_context *ctx,
+                  struct gl_texture_object *texObj,
+                  GLenum target, GLint level,
+                  GLint xoffset, GLint yoffset, GLint zoffset,
+                  GLsizei width, GLsizei height, GLint depth,
+                  GLenum format, GLenum type,
+                  GLvoid *pixels, const char *caller)
 {
-   assert(texObj);
-   assert(texImage);
+   struct gl_texture_image *texImage;
+   unsigned firstFace, numFaces, i;
+   GLint imageStride;
 
    FLUSH_VERTICES(ctx, 0);
 
-   /*
-    * Legal target checking has been moved up to GetnTexImage and
-    * GetTextureImage so that it can be caught before receiving a NULL
-    * texImage object and exiting.
-    */
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);  /* should have been error checked already */
 
-   if (getteximage_error_check(ctx, texImage, target, level, format,
-                               type, bufSize, pixels, dsa)) {
+   if (_mesa_is_zero_size_texture(texImage)) {
+      /* no image data to return */
       return;
    }
 
-   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
-      /* not an error, do nothing */
-      return;
-   }
-
-   if (_mesa_is_zero_size_texture(texImage))
-      return;
-
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) {
-      _mesa_debug(ctx, "glGetTex%sImage(tex %u) format = %s, w=%d, h=%d,"
+      _mesa_debug(ctx, "%s(tex %u) format = %s, w=%d, h=%d,"
                   " dstFmt=0x%x, dstType=0x%x\n",
-                  dsa ? "ture": "",
-                  texObj->Name,
+                  caller, texObj->Name,
                   _mesa_get_format_name(texImage->TexFormat),
                   texImage->Width, texImage->Height,
                   format, type);
    }
 
-   _mesa_lock_texture(ctx, texObj);
-   {
-      ctx->Driver.GetTexImage(ctx, format, type, pixels, texImage);
+   if (target == GL_TEXTURE_CUBE_MAP) {
+      /* zoffset/depth select cube faces; compute stride between faces */
+      imageStride = _mesa_image_image_stride(&ctx->Pack, width, height,
+                                             format, type);
+      firstFace = zoffset;
+      numFaces = depth;
+      zoffset = 0;
+      depth = 1;
    }
+   else {
+      imageStride = 0;
+      firstFace = _mesa_tex_target_to_face(target);
+      numFaces = 1;
+   }
+
+   _mesa_lock_texture(ctx, texObj);
+
+   for (i = 0; i < numFaces; i++) {
+      texImage = texObj->Image[firstFace + i][level];
+      assert(texImage);
+
+      ctx->Driver.GetTexSubImage(ctx, xoffset, yoffset, zoffset,
+                                 width, height, depth,
+                                 format, type, pixels, texImage);
+
+      /* next cube face */
+      pixels = (GLubyte *) pixels + imageStride;
+   }
+
    _mesa_unlock_texture(ctx, texObj);
 }
 
-/**
- * Get texture image.  Called by glGetTexImage.
- *
- * \param target texture target.
- * \param level image level.
- * \param format pixel data format for returned image.
- * \param type pixel data type for returned image.
- * \param bufSize size of the pixels data buffer.
- * \param pixels returned pixel data.
- */
+/** glGetnTexImageARB(): return a whole texture level, bounded by bufSize. */
 void GLAPIENTRY
-_mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format,
-                      GLenum type, GLsizei bufSize, GLvoid *pixels)
+_mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format, GLenum type,
+                      GLsizei bufSize, GLvoid *pixels)
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   GLenum err;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetnTexImageARB";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
 
-   /*
-    * This has been moved here because a format/type mismatch can cause a NULL
-    * texImage object, which in turn causes the mismatch error to be
-    * ignored.
-    */
-   err = _mesa_error_check_format_and_type(ctx, format, type);
-   if (err != GL_NO_ERROR) {
-      _mesa_error(ctx, err, "glGetnTexImage(format/type)");
-      return;
-   }
-
-   /*
-    * Legal target checking has been moved here to prevent exiting with a NULL
-    * texImage object.
-    */
    if (!legal_getteximage_target(ctx, target, false)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glGetnTexImage(target=0x%x)",
-                  target);
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
       return;
    }
 
    texObj = _mesa_get_current_tex_object(ctx, target);
-   if (!texObj)
-      return;
+   assert(texObj);
 
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   if (!texImage)
-      return;
+   get_texture_image_dims(texObj, target, level, &width, &height, &depth);
 
-   _mesa_get_texture_image(ctx, texObj, texImage, target, level, format, type,
-                           bufSize, pixels, false);
+   if (getteximage_error_check(ctx, texObj, target, level,
+                               0, 0, 0, width, height, depth,
+                               format, type, bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_texture_image(ctx, texObj, target, level,
+                     0, 0, 0, width, height, depth,
+                     format, type, pixels, caller);
 }
 
 
 void GLAPIENTRY
-_mesa_GetTexImage( GLenum target, GLint level, GLenum format,
-                   GLenum type, GLvoid *pixels )
+_mesa_GetTexImage(GLenum target, GLint level, GLenum format, GLenum type,
+                  GLvoid *pixels )
 {
-   _mesa_GetnTexImageARB(target, level, format, type, INT_MAX, pixels);
-}
-
-/**
- * Get texture image.
- *
- * \param texture texture name.
- * \param level image level.
- * \param format pixel data format for returned image.
- * \param type pixel data type for returned image.
- * \param bufSize size of the pixels data buffer.
- * \param pixels returned pixel data.
- */
-void GLAPIENTRY
-_mesa_GetTextureImage(GLuint texture, GLint level, GLenum format,
-                      GLenum type, GLsizei bufSize, GLvoid *pixels)
-{
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   int i;
-   GLint image_stride;
-   GLenum err;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetTexImage";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
 
-   /*
-    * This has been moved here because a format/type mismatch can cause a NULL
-    * texImage object, which in turn causes the mismatch error to be
-    * ignored.
-    */
-   err = _mesa_error_check_format_and_type(ctx, format, type);
-   if (err != GL_NO_ERROR) {
-      _mesa_error(ctx, err, "glGetTextureImage(format/type)");
+   if (!legal_getteximage_target(ctx, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
       return;
    }
 
-   texObj = _mesa_lookup_texture_err(ctx, texture, "glGetTextureImage");
-   if (!texObj)
-      return;
+   texObj = _mesa_get_current_tex_object(ctx, target);
+   assert(texObj);
 
-   /*
-    * Legal target checking has been moved here to prevent exiting with a NULL
-    * texImage object.
-    */
-   if (!legal_getteximage_target(ctx, texObj->Target, true)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glGetTextureImage(target=%s)",
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+   get_texture_image_dims(texObj, target, level, &width, &height, &depth);
+   /* glGetTexImage() has no bufSize argument: INT_MAX is passed instead */
+   if (getteximage_error_check(ctx, texObj, target, level,
+                               0, 0, 0, width, height, depth,
+                               format, type, INT_MAX, pixels, caller)) {
       return;
    }
 
-   /* Must handle special case GL_TEXTURE_CUBE_MAP. */
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
-
-      /* Make sure the texture object is a proper cube.
-       * (See texturesubimage in teximage.c for details on why this check is
-       * performed.)
-       */
-      if (!_mesa_cube_level_complete(texObj, level)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetTextureImage(cube map incomplete)");
-         return;
-      }
-
-      /* Copy each face. */
-      for (i = 0; i < 6; ++i) {
-         texImage = texObj->Image[i][level];
-         assert(texImage);
-
-         _mesa_get_texture_image(ctx, texObj, texImage, texObj->Target, level,
-                                 format, type, bufSize, pixels, true);
-
-         image_stride = _mesa_image_image_stride(&ctx->Pack, texImage->Width,
-                                                 texImage->Height, format,
-                                                 type);
-         pixels = (GLubyte *) pixels + image_stride;
-         bufSize -= image_stride;
-      }
-   }
-   else {
-      texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
-      if (!texImage)
-         return;
-
-      _mesa_get_texture_image(ctx, texObj, texImage, texObj->Target, level,
-                              format, type, bufSize, pixels, true);
-   }
+   get_texture_image(ctx, texObj, target, level,
+                     0, 0, 0, width, height, depth,
+                     format, type, pixels, caller);
 }
 
-/**
- * Do error checking for a glGetCompressedTexImage() call.
- * \return GL_TRUE if any error, GL_FALSE if no errors.
- */
-static GLboolean
-getcompressedteximage_error_check(struct gl_context *ctx,
-                                  struct gl_texture_image *texImage,
-                                  GLenum target,
-                                  GLint level, GLsizei clientMemSize,
-                                  GLvoid *img, bool dsa)
+/** glGetTextureImage(): DSA variant; the texture is looked up by name. */
+void GLAPIENTRY
+_mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, GLenum type,
+                      GLsizei bufSize, GLvoid *pixels)
 {
-   const GLint maxLevels = _mesa_max_texture_levels(ctx, target);
-   GLuint compressedSize, dimensions;
-   const char *suffix = dsa ? "ture" : "";
+   GET_CURRENT_CONTEXT(ctx);
+   GLsizei width, height, depth;
+   static const char *caller = "glGetTextureImage";
+   struct gl_texture_object *texObj =
+      _mesa_lookup_texture_err(ctx, texture, caller);
 
-   assert(texImage);
-
-   if (!legal_getteximage_target(ctx, target, dsa)) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetCompressedTex%sImage(target=%s)", suffix,
-                  _mesa_lookup_enum_by_nr(target));
-      return GL_TRUE;
+   if (!texObj) {
+      return;
    }
 
-   assert(maxLevels != 0);
+   get_texture_image_dims(texObj, texObj->Target, level,
+                          &width, &height, &depth);
+
+   if (getteximage_error_check(ctx, texObj, texObj->Target, level,
+                               0, 0, 0, width, height, depth,
+                               format, type, bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_texture_image(ctx, texObj, texObj->Target, level,
+                     0, 0, 0, width, height, depth,
+                     format, type, pixels, caller);
+}
+
+/** glGetTextureSubImage(): return a caller-specified region of a level. */
+void GLAPIENTRY
+_mesa_GetTextureSubImage(GLuint texture, GLint level,
+                         GLint xoffset, GLint yoffset, GLint zoffset,
+                         GLsizei width, GLsizei height, GLsizei depth,
+                         GLenum format, GLenum type, GLsizei bufSize,
+                         void *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetTextureSubImage";
+   struct gl_texture_object *texObj =
+      _mesa_lookup_texture_err(ctx, texture, caller);
+
+   if (!texObj) {
+      return;
+   }
+   /* The region comes straight from the caller and is validated here. */
+   if (getteximage_error_check(ctx, texObj, texObj->Target, level,
+                               xoffset, yoffset, zoffset, width, height, depth,
+                               format, type, bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_texture_image(ctx, texObj, texObj->Target, level,
+                     xoffset, yoffset, zoffset, width, height, depth,
+                     format, type, pixels, caller);
+}
+
+
+
+/**
+ * Compute how many destination bytes are spanned, counting skip bytes and
+ * full row/slice strides, when retrieving part of a compressed texture.
+ */
+static GLsizei
+packed_compressed_size(GLuint dimensions, mesa_format format,
+                       GLsizei width, GLsizei height, GLsizei depth,
+                       const struct gl_pixelstore_attrib *packing)
+{
+   struct compressed_pixelstore st;
+   GLsizei totalBytes;
+
+   _mesa_compute_compressed_pixelstore(dimensions, format,
+                                       width, height, depth,
+                                       packing, &st);
+   totalBytes =
+      (st.CopySlices - 1) * st.TotalRowsPerSlice * st.TotalBytesPerRow +
+      st.SkipBytes +
+      (st.CopyRowsPerSlice - 1) * st.TotalBytesPerRow +
+      st.CopyBytesPerRow;
+
+   return totalBytes;
+}
+
+
+/**
+ * Do error checking for getting compressed texture images.
+ * \return true on error or when there is nothing to do, else false.
+ */
+static bool
+getcompressedteximage_error_check(struct gl_context *ctx,
+                                  struct gl_texture_object *texObj,
+                                  GLenum target, GLint level,
+                                  GLint xoffset, GLint yoffset, GLint zoffset,
+                                  GLsizei width, GLsizei height, GLsizei depth,
+                                  GLsizei bufSize, GLvoid *pixels,
+                                  const char *caller)
+{
+   struct gl_texture_image *texImage;
+   GLint maxLevels;
+   GLsizei totalBytes;
+   GLuint dimensions;
+
+   assert(texObj);
+
+   if (texObj->Target == 0) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller);
+      return true;
+   }
+
+   maxLevels = _mesa_max_texture_levels(ctx, target);
    if (level < 0 || level >= maxLevels) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glGetCompressedTex%sImage(bad level = %d)", suffix, level);
-      return GL_TRUE;
+                  "%s(bad level = %d)", caller, level);
+      return true;
    }
 
+   if (dimensions_error_check(ctx, texObj, target, level,
+                              xoffset, yoffset, zoffset,
+                              width, height, depth, caller)) {
+      return true;
+   }
+
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);
+
    if (!_mesa_is_format_compressed(texImage->TexFormat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glGetCompressedTex%sImage(texture is not compressed)",
-                  suffix);
-      return GL_TRUE;
+                  "%s(texture is not compressed)", caller);
+      return true;
    }
 
-   compressedSize = _mesa_format_image_size(texImage->TexFormat,
-                                            texImage->Width,
-                                            texImage->Height,
-                                            texImage->Depth);
-
    /* Check for invalid pixel storage modes */
-   dimensions = _mesa_get_texture_dimensions(texImage->TexObject->Target);
+   dimensions = _mesa_get_texture_dimensions(texObj->Target);
    if (!_mesa_compressed_pixel_storage_error_check(ctx, dimensions,
-                                              &ctx->Pack, dsa ?
-                                              "glGetCompressedTextureImage":
-                                              "glGetCompressedTexImage")) {
-      return GL_TRUE;
+                                                   &ctx->Pack,
+                                                   caller)) {
+      return true;
    }
 
-   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-      /* do bounds checking on writing to client memory */
-      if (clientMemSize < (GLsizei) compressedSize) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "%s(out of bounds access: bufSize (%d) is too small)",
-                     dsa ? "glGetCompressedTextureImage" :
-                     "glGetnCompressedTexImageARB", clientMemSize);
-         return GL_TRUE;
-      }
-   } else {
+   /* Compute number of bytes that may be touched in the dest buffer */
+   totalBytes = packed_compressed_size(dimensions, texImage->TexFormat,
+                                       width, height, depth,
+                                       &ctx->Pack);
+
+   /* Bounds-check the dest: with a PBO, pixels is compared as an offset */
+   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
       /* do bounds checking on PBO write */
-      if ((const GLubyte *) img + compressedSize >
-          (const GLubyte *) ctx->Pack.BufferObj->Size) {
+      if ((GLubyte *) pixels + totalBytes >
+          (GLubyte *) ctx->Pack.BufferObj->Size) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetCompressedTex%sImage(out of bounds PBO access)",
-                     suffix);
-         return GL_TRUE;
+                     "%s(out of bounds PBO access)", caller);
+         return true;
       }
 
       /* make sure PBO is not mapped */
       if (_mesa_check_disallowed_mapping(ctx->Pack.BufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(PBO is mapped)", caller);
+         return true;
+      }
+   }
+   else {
+      /* do bounds checking on writing to client memory */
+      if (totalBytes > bufSize) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetCompressedTex%sImage(PBO is mapped)", suffix);
-         return GL_TRUE;
+                     "%s(out of bounds access: bufSize (%d) is too small)",
+                     caller, bufSize);
+         return true;
       }
    }
 
-   return GL_FALSE;
+   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
+      /* not an error, but do nothing */
+      return true;
+   }
+
+   return false;
 }
 
-/** Implements glGetnCompressedTexImageARB, glGetCompressedTexImage, and
- * glGetCompressedTextureImage.
- *
- * texImage must be passed in because glGetCompressedTexImage must handle the
- * target GL_TEXTURE_CUBE_MAP.
+
+/**
+ * Common helper for all glGetCompressed*Image functions (args pre-validated).
  */
-void
-_mesa_get_compressed_texture_image(struct gl_context *ctx,
-                                   struct gl_texture_object *texObj,
-                                   struct gl_texture_image *texImage,
-                                   GLenum target, GLint level,
-                                   GLsizei bufSize, GLvoid *pixels,
-                                   bool dsa)
+static void
+get_compressed_texture_image(struct gl_context *ctx,
+                             struct gl_texture_object *texObj,
+                             GLenum target, GLint level,
+                             GLint xoffset, GLint yoffset, GLint zoffset,
+                             GLsizei width, GLsizei height, GLint depth,
+                             GLvoid *pixels,
+                             const char *caller)
 {
-   assert(texObj);
-   assert(texImage);
+   struct gl_texture_image *texImage;
+   unsigned firstFace, numFaces, i, imageStride;
 
    FLUSH_VERTICES(ctx, 0);
 
-   if (getcompressedteximage_error_check(ctx, texImage, target, level,
-                                         bufSize, pixels, dsa)) {
-      return;
-   }
-
-   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && !pixels) {
-      /* not an error, do nothing */
-      return;
-   }
+   texImage = select_tex_image(texObj, target, level, zoffset);
+   assert(texImage);  /* should have been error checked already */
 
    if (_mesa_is_zero_size_texture(texImage))
       return;
 
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE)) {
       _mesa_debug(ctx,
-                  "glGetCompressedTex%sImage(tex %u) format = %s, w=%d, h=%d\n",
-                  dsa ? "ture" : "", texObj->Name,
+                  "%s(tex %u) format = %s, w=%d, h=%d\n",
+                  caller, texObj->Name,
                   _mesa_get_format_name(texImage->TexFormat),
                   texImage->Width, texImage->Height);
    }
 
-   _mesa_lock_texture(ctx, texObj);
-   {
-      ctx->Driver.GetCompressedTexImage(ctx, texImage, pixels);
+   if (target == GL_TEXTURE_CUBE_MAP) {
+      struct compressed_pixelstore store;
+
+      /* Compute image stride between cube faces */
+      _mesa_compute_compressed_pixelstore(2, texImage->TexFormat,
+                                          width, height, depth,
+                                          &ctx->Pack, &store);
+      imageStride = store.TotalBytesPerRow * store.TotalRowsPerSlice;
+
+      firstFace = zoffset;   /* zoffset/depth select a range of cube faces */
+      numFaces = depth;
+      zoffset = 0;           /* each face is then read as a single 2D slice */
+      depth = 1;
    }
+   else {
+      imageStride = 0;       /* only one image is read; no face stride */
+      firstFace = _mesa_tex_target_to_face(target);
+      numFaces = 1;
+   }
+
+   _mesa_lock_texture(ctx, texObj);
+
+   for (i = 0; i < numFaces; i++) {
+      texImage = texObj->Image[firstFace + i][level];
+      assert(texImage);
+
+      ctx->Driver.GetCompressedTexSubImage(ctx, texImage,
+                                           xoffset, yoffset, zoffset,
+                                           width, height, depth, pixels);
+
+      /* next cube face */
+      pixels = (GLubyte *) pixels + imageStride;
+   }
+
    _mesa_unlock_texture(ctx, texObj);
 }
 
+/** glGetnCompressedTexImageARB: bufSize bounds writes to client memory. */
 void GLAPIENTRY
 _mesa_GetnCompressedTexImageARB(GLenum target, GLint level, GLsizei bufSize,
-                                GLvoid *img)
+                                GLvoid *pixels)
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetnCompressedTexImageARB";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
+
+   if (!legal_getteximage_target(ctx, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
+      return;
+   }
 
    texObj = _mesa_get_current_tex_object(ctx, target);
-   if (!texObj)
-      return;
+   assert(texObj);
 
-   texImage = _mesa_select_tex_image(texObj, target, level);
-   if (!texImage)
-      return;
+   get_texture_image_dims(texObj, target, level, &width, &height, &depth);
 
-   _mesa_get_compressed_texture_image(ctx, texObj, texImage, target, level,
-                                      bufSize, img, false);
+   if (getcompressedteximage_error_check(ctx, texObj, target, level,
+                                         0, 0, 0, width, height, depth,
+                                         bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_compressed_texture_image(ctx, texObj, target, level,
+                                0, 0, 0, width, height, depth,
+                                pixels, caller);
 }
 
+/** glGetCompressedTexImage: has no bufSize argument, so INT_MAX is used. */
 void GLAPIENTRY
-_mesa_GetCompressedTexImage(GLenum target, GLint level, GLvoid *img)
+_mesa_GetCompressedTexImage(GLenum target, GLint level, GLvoid *pixels)
 {
-   _mesa_GetnCompressedTexImageARB(target, level, INT_MAX, img);
+   GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetCompressedTexImage";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj;
+
+   if (!legal_getteximage_target(ctx, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s", caller);
+      return;
+   }
+
+   texObj = _mesa_get_current_tex_object(ctx, target);
+   assert(texObj);
+
+   get_texture_image_dims(texObj, target, level,
+                          &width, &height, &depth);
+
+   if (getcompressedteximage_error_check(ctx, texObj, target, level,
+                                         0, 0, 0, width, height, depth,
+                                         INT_MAX, pixels, caller)) {
+      return;
+   }
+
+   get_compressed_texture_image(ctx, texObj, target, level,
+                                0, 0, 0, width, height, depth,
+                                pixels, caller);
 }
 
-/**
- * Get compressed texture image.
- *
- * \param texture texture name.
- * \param level image level.
- * \param bufSize size of the pixels data buffer.
- * \param pixels returned pixel data.
- */
+/** Entry point for glGetCompressedTextureImage (texture looked up by id). */
 void GLAPIENTRY
 _mesa_GetCompressedTextureImage(GLuint texture, GLint level,
                                 GLsizei bufSize, GLvoid *pixels)
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   int i;
-   GLint image_stride;
    GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetCompressedTextureImage";
+   GLsizei width, height, depth;
+   struct gl_texture_object *texObj =
+      _mesa_lookup_texture_err(ctx, texture, caller);
 
-   texObj = _mesa_lookup_texture_err(ctx, texture,
-                                     "glGetCompressedTextureImage");
-   if (!texObj)
+   if (!texObj) {
       return;
-
-   /* Must handle special case GL_TEXTURE_CUBE_MAP. */
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
-
-      /* Make sure the texture object is a proper cube.
-       * (See texturesubimage in teximage.c for details on why this check is
-       * performed.)
-       */
-      if (!_mesa_cube_level_complete(texObj, level)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetCompressedTextureImage(cube map incomplete)");
-         return;
-      }
-
-      /* Copy each face. */
-      for (i = 0; i < 6; ++i) {
-         texImage = texObj->Image[i][level];
-         assert(texImage);
-
-         _mesa_get_compressed_texture_image(ctx, texObj, texImage,
-                                            texObj->Target, level,
-                                            bufSize, pixels, true);
-
-         /* Compressed images don't have a client format */
-         image_stride = _mesa_format_image_size(texImage->TexFormat,
-                                                texImage->Width,
-                                                texImage->Height, 1);
-
-         pixels = (GLubyte *) pixels + image_stride;
-         bufSize -= image_stride;
-      }
    }
-   else {
-      texImage = _mesa_select_tex_image(texObj, texObj->Target, level);
-      if (!texImage)
-         return;
 
-      _mesa_get_compressed_texture_image(ctx, texObj, texImage,
-                                         texObj->Target, level, bufSize,
-                                         pixels, true);
+   get_texture_image_dims(texObj, texObj->Target, level,
+                          &width, &height, &depth);
+
+   if (getcompressedteximage_error_check(ctx, texObj, texObj->Target, level,
+                                         0, 0, 0, width, height, depth,
+                                         bufSize, pixels, caller)) {
+      return;
    }
+
+   get_compressed_texture_image(ctx, texObj, texObj->Target, level,
+                                0, 0, 0, width, height, depth,
+                                pixels, caller);
+}
+
+/** Entry point for glGetCompressedTextureSubImage (texture looked up by id). */
+void GLAPIENTRY
+_mesa_GetCompressedTextureSubImage(GLuint texture, GLint level,
+                                   GLint xoffset, GLint yoffset,
+                                   GLint zoffset, GLsizei width,
+                                   GLsizei height, GLsizei depth,
+                                   GLsizei bufSize, void *pixels)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   static const char *caller = "glGetCompressedTextureSubImage";
+   struct gl_texture_object *texObj;
+
+   texObj = _mesa_lookup_texture_err(ctx, texture, caller);
+   if (!texObj) {
+      return;
+   }
+
+   if (getcompressedteximage_error_check(ctx, texObj, texObj->Target, level,
+                                         xoffset, yoffset, zoffset,
+                                         width, height, depth,
+                                         bufSize, pixels, caller)) {
+      return;
+   }
+
+   get_compressed_texture_image(ctx, texObj, texObj->Target, level,
+                                xoffset, yoffset, zoffset,
+                                width, height, depth,
+                                pixels, caller);
 }
diff --git a/src/mesa/main/texgetimage.h b/src/mesa/main/texgetimage.h
index 1fa2f59dcdc..63c75eb931d 100644
--- a/src/mesa/main/texgetimage.h
+++ b/src/mesa/main/texgetimage.h
@@ -37,22 +37,19 @@ extern GLenum
 _mesa_base_pack_format(GLenum format);
 
 extern void
-_mesa_GetTexImage_sw(struct gl_context *ctx,
-                     GLenum format, GLenum type, GLvoid *pixels,
-                     struct gl_texture_image *texImage);
-
+_mesa_GetTexSubImage_sw(struct gl_context *ctx,
+                        GLint xoffset, GLint yoffset, GLint zoffset,
+                        GLsizei width, GLsizei height, GLint depth,
+                        GLenum format, GLenum type, GLvoid *pixels,
+                        struct gl_texture_image *texImage);
 
 extern void
-_mesa_GetCompressedTexImage_sw(struct gl_context *ctx,
-                               struct gl_texture_image *texImage,
-                               GLvoid *data);
-
-extern void
-_mesa_get_texture_image(struct gl_context *ctx,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage, GLenum target,
-                        GLint level, GLenum format, GLenum type,
-                        GLsizei bufSize, GLvoid *pixels, bool dsa);
+_mesa_GetCompressedTexSubImage_sw(struct gl_context *ctx,
+                                  struct gl_texture_image *texImage,
+                                  GLint xoffset, GLint yoffset,
+                                  GLint zoffset, GLsizei width,
+                                  GLint height, GLint depth,
+                                  GLvoid *data);
 
 extern void
 _mesa_get_compressed_texture_image( struct gl_context *ctx,
@@ -73,6 +70,14 @@ extern void GLAPIENTRY
 _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format,
                       GLenum type, GLsizei bufSize, GLvoid *pixels);
 
+extern void GLAPIENTRY
+_mesa_GetTextureSubImage(GLuint texture, GLint level,
+                         GLint xoffset, GLint yoffset, GLint zoffset,
+                         GLsizei width, GLsizei height, GLsizei depth,
+                         GLenum format, GLenum type, GLsizei bufSize,
+                         void *pixels);
+
+
 extern void GLAPIENTRY
 _mesa_GetCompressedTexImage(GLenum target, GLint lod, GLvoid *img);
 
@@ -84,4 +89,11 @@ extern void GLAPIENTRY
 _mesa_GetCompressedTextureImage(GLuint texture, GLint level, GLsizei bufSize,
                                 GLvoid *pixels);
 
+extern void GLAPIENTRY
+_mesa_GetCompressedTextureSubImage(GLuint texture, GLint level,
+                                   GLint xoffset, GLint yoffset,
+                                   GLint zoffset, GLsizei width,
+                                   GLsizei height, GLsizei depth,
+                                   GLsizei bufSize, void *pixels);
+
 #endif /* TEXGETIMAGE_H */
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 3d85615fa45..3a556a6ad6e 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1008,7 +1008,7 @@ _mesa_max_texture_levels(struct gl_context *ctx, GLenum target)
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
-      return _mesa_is_desktop_gl(ctx)
+      return (_mesa_is_desktop_gl(ctx) || _mesa_is_gles31(ctx))
          && ctx->Extensions.ARB_texture_multisample
          ? 1 : 0;
    case GL_TEXTURE_EXTERNAL_OES:
@@ -1793,8 +1793,6 @@ GLboolean
 _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
                                GLenum intFormat)
 {
-   (void) intFormat;  /* not used yet */
-
    switch (target) {
    case GL_TEXTURE_2D:
    case GL_PROXY_TEXTURE_2D:
@@ -1814,6 +1812,16 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
    case GL_PROXY_TEXTURE_CUBE_MAP_ARRAY:
    case GL_TEXTURE_CUBE_MAP_ARRAY:
       return ctx->Extensions.ARB_texture_cube_map_array;
+   case GL_TEXTURE_3D:
+      switch (intFormat) {
+      case GL_COMPRESSED_RGBA_BPTC_UNORM:
+      case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
+      case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
+      case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
+         return ctx->Extensions.ARB_texture_compression_bptc;
+      default:
+         return GL_FALSE;
+      }
    default:
       return GL_FALSE;
    }
@@ -2080,6 +2088,53 @@ texture_formats_agree(GLenum internalFormat,
    return true;
 }
 
+/**
+ * Validate the format, type and internal format arguments of a texture
+ * operation against the rules that apply to the current GLES context.
+ *
+ * \param ctx GL context.
+ * \param format pixel data format given by the user.
+ * \param type pixel data type given by the user.
+ * \param internalFormat internal format; only consulted on GLES 3 contexts.
+ * \param dimensions texture image dimensions; only consulted before GLES 3.
+ * \param callerName name of the caller function to print in the error message
+ *
+ * \return true if an error was recorded on the context, false otherwise
+ *
+ * Currently, it is used by texture_error_check() and texsubimage_error_check().
+ */
+static bool
+texture_format_error_check_gles(struct gl_context *ctx, GLenum format,
+                                GLenum type, GLenum internalFormat,
+                                GLuint dimensions, const char *callerName)
+{
+   GLenum status;
+
+   if (!_mesa_is_gles3(ctx)) {
+      /* Non-ES3 path: internalFormat plays no part in this check. */
+      status = _mesa_es_error_check_format_and_type(format, type, dimensions);
+      if (status == GL_NO_ERROR)
+         return false;
+
+      _mesa_error(ctx, status, "%s(format = %s, type = %s)",
+                  callerName, _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
+      return true;
+   }
+
+   status = _mesa_es3_error_check_format_and_type(ctx, format, type,
+                                                  internalFormat);
+   if (status == GL_NO_ERROR)
+      return false;
+
+   _mesa_error(ctx, status,
+               "%s(format = %s, type = %s, internalformat = %s)",
+               callerName, _mesa_enum_to_string(format),
+               _mesa_enum_to_string(type),
+               _mesa_enum_to_string(internalFormat));
+   return true;
+}
+
 /**
  * Test the glTexImage[123]D() parameters for errors.
  *
@@ -2151,39 +2206,17 @@ texture_error_check( struct gl_context *ctx,
     * Formats and types that require additional extensions (e.g., GL_FLOAT
     * requires GL_OES_texture_float) are filtered elsewhere.
     */
-
-   if (_mesa_is_gles(ctx)) {
-      if (_mesa_is_gles3(ctx)) {
-         err = _mesa_es3_error_check_format_and_type(ctx, format, type,
-                                                     internalFormat);
-      } else {
-         if (format != internalFormat) {
-            _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glTexImage%dD(format = %s, internalFormat = %s)",
-                        dimensions,
-                        _mesa_lookup_enum_by_nr(format),
-                        _mesa_lookup_enum_by_nr(internalFormat));
-            return GL_TRUE;
-         }
-
-         err = _mesa_es_error_check_format_and_type(format, type, dimensions);
-      }
-      if (err != GL_NO_ERROR) {
-         _mesa_error(ctx, err,
-                     "glTexImage%dD(format = %s, type = %s, internalFormat = %s)",
-                     dimensions,
-                     _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type),
-                     _mesa_lookup_enum_by_nr(internalFormat));
-         return GL_TRUE;
-      }
+   if (_mesa_is_gles(ctx) &&
+       texture_format_error_check_gles(ctx, format, type, internalFormat,
+                                       dimensions, "glTexImage%dD")) {
+     return GL_TRUE;
    }
 
    /* Check internalFormat */
    if (_mesa_base_tex_format(ctx, internalFormat) < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glTexImage%dD(internalFormat=%s)",
-                  dimensions, _mesa_lookup_enum_by_nr(internalFormat));
+                  dimensions, _mesa_enum_to_string(internalFormat));
       return GL_TRUE;
    }
 
@@ -2192,8 +2225,8 @@ texture_error_check( struct gl_context *ctx,
    if (err != GL_NO_ERROR) {
       _mesa_error(ctx, err,
                   "glTexImage%dD(incompatible format = %s, type = %s)",
-                  dimensions, _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  dimensions, _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       return GL_TRUE;
    }
 
@@ -2208,8 +2241,8 @@ texture_error_check( struct gl_context *ctx,
    if (!texture_formats_agree(internalFormat, format)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glTexImage%dD(incompatible internalFormat = %s, format = %s)",
-                  dimensions, _mesa_lookup_enum_by_nr(internalFormat),
-                  _mesa_lookup_enum_by_nr(format));
+                  dimensions, _mesa_enum_to_string(internalFormat),
+                  _mesa_enum_to_string(format));
       return GL_TRUE;
    }
 
@@ -2324,7 +2357,7 @@ compressed_texture_error_check(struct gl_context *ctx, GLint dimensions,
    if (!_mesa_is_compressed_format(ctx, internalFormat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCompressedTexImage%dD(internalFormat=%s)",
-                  dimensions, _mesa_lookup_enum_by_nr(internalFormat));
+                  dimensions, _mesa_enum_to_string(internalFormat));
       return GL_TRUE;
    }
 
@@ -2479,50 +2512,12 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   /* check target (proxies not allowed) */
-   if (!legal_texsubimage_target(ctx, dimensions, target, dsa)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)",
-                  callerName, _mesa_lookup_enum_by_nr(target));
-      return GL_TRUE;
-   }
-
    /* level check */
    if (level < 0 || level >= _mesa_max_texture_levels(ctx, target)) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(level=%d)", callerName, level);
       return GL_TRUE;
    }
 
-   /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the
-    * combinations of format and type that can be used.  Formats and types
-    * that require additional extensions (e.g., GL_FLOAT requires
-    * GL_OES_texture_float) are filtered elsewhere.
-    */
-   if (_mesa_is_gles(ctx) && !_mesa_is_gles3(ctx)) {
-      err = _mesa_es_error_check_format_and_type(format, type, dimensions);
-      if (err != GL_NO_ERROR) {
-         _mesa_error(ctx, err, "%s(format = %s, type = %s)",
-                     callerName, _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type));
-         return GL_TRUE;
-      }
-   }
-
-   err = _mesa_error_check_format_and_type(ctx, format, type);
-   if (err != GL_NO_ERROR) {
-      _mesa_error(ctx, err,
-                  "%s(incompatible format = %s, type = %s)",
-                  callerName, _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
-      return GL_TRUE;
-   }
-
-   /* validate the bound PBO, if any */
-   if (!_mesa_validate_pbo_source(ctx, dimensions, &ctx->Unpack,
-                                  width, height, depth, format, type,
-                                  INT_MAX, pixels, callerName)) {
-      return GL_TRUE;
-   }
-
    texImage = _mesa_select_tex_image(texObj, target, level);
    if (!texImage) {
       /* non-existant texture level */
@@ -2531,6 +2526,34 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
+   err = _mesa_error_check_format_and_type(ctx, format, type);
+   if (err != GL_NO_ERROR) {
+      _mesa_error(ctx, err,
+                  "%s(incompatible format = %s, type = %s)",
+                  callerName, _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
+      return GL_TRUE;
+   }
+
+   /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the
+    * combinations of format, internalFormat, and type that can be used.
+    * Formats and types that require additional extensions (e.g., GL_FLOAT
+    * requires GL_OES_texture_float) are filtered elsewhere.
+    */
+   if (_mesa_is_gles(ctx) &&
+       texture_format_error_check_gles(ctx, format, type,
+                                       texImage->InternalFormat,
+                                       dimensions, callerName)) {
+      return GL_TRUE;
+   }
+
+   /* validate the bound PBO, if any */
+   if (!_mesa_validate_pbo_source(ctx, dimensions, &ctx->Unpack,
+                                  width, height, depth, format, type,
+                                  INT_MAX, pixels, callerName)) {
+      return GL_TRUE;
+   }
+
    if (error_check_subtexture_dimensions(ctx, dimensions,
                                          texImage, xoffset, yoffset, zoffset,
                                          width, height, depth, callerName)) {
@@ -2590,7 +2613,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
    /* check target */
    if (!legal_texsubimage_target(ctx, dimensions, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexImage%uD(target=%s)",
-                  dimensions, _mesa_lookup_enum_by_nr(target));
+                  dimensions, _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
@@ -2629,13 +2652,6 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
-   if (rb == NULL) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glCopyTexImage%dD(read buffer)", dimensions);
-      return GL_TRUE;
-   }
-
    /* OpenGL ES 1.x and OpenGL ES 2.0 impose additional restrictions on the
     * internalFormat.
     */
@@ -2648,18 +2664,25 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       case GL_LUMINANCE_ALPHA:
          break;
       default:
-         _mesa_error(ctx, GL_INVALID_VALUE,
+         _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
 
    baseFormat = _mesa_base_tex_format(ctx, internalFormat);
    if (baseFormat < 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
+      _mesa_error(ctx, GL_INVALID_ENUM,
                   "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                  _mesa_lookup_enum_by_nr(internalFormat));
+                  _mesa_enum_to_string(internalFormat));
+      return GL_TRUE;
+   }
+
+   rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
+   if (rb == NULL) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glCopyTexImage%dD(read buffer)", dimensions);
       return GL_TRUE;
    }
 
@@ -2669,7 +2692,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       if (rb_base_format < 0) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -2696,7 +2719,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       if (!valid) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -2735,10 +2758,10 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
        * types for SNORM formats. Also, conversion to SNORM formats is not
        * allowed by Table 3.2 on Page 110.
        */
-      if(_mesa_is_enum_format_snorm(internalFormat)) {
+      if (_mesa_is_enum_format_snorm(internalFormat)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glCopyTexImage%dD(internalFormat=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     _mesa_enum_to_string(internalFormat));
          return GL_TRUE;
       }
    }
@@ -3103,8 +3126,8 @@ _mesa_choose_texture_format(struct gl_context *ctx,
                        "DXT compression requested (%s), "
                        "but libtxc_dxtn library not installed.  Using %s "
                        "instead.",
-                       _mesa_lookup_enum_by_nr(before),
-                       _mesa_lookup_enum_by_nr(internalFormat));
+                       _mesa_enum_to_string(before),
+                       _mesa_enum_to_string(internalFormat));
       }
    }
 
@@ -3191,18 +3214,18 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
          _mesa_debug(ctx,
                      "glCompressedTexImage%uD %s %d %s %d %d %d %d %p\n",
                      dims,
-                     _mesa_lookup_enum_by_nr(target), level,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target), level,
+                     _mesa_enum_to_string(internalFormat),
                      width, height, depth, border, pixels);
       else
          _mesa_debug(ctx,
                      "glTexImage%uD %s %d %s %d %d %d %d %s %s %p\n",
                      dims,
-                     _mesa_lookup_enum_by_nr(target), level,
-                     _mesa_lookup_enum_by_nr(internalFormat),
+                     _mesa_enum_to_string(target), level,
+                     _mesa_enum_to_string(internalFormat),
                      width, height, depth, border,
-                     _mesa_lookup_enum_by_nr(format),
-                     _mesa_lookup_enum_by_nr(type), pixels);
+                     _mesa_enum_to_string(format),
+                     _mesa_enum_to_string(type), pixels);
    }
 
    internalFormat = override_internal_format(internalFormat, width, height);
@@ -3210,7 +3233,7 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
    /* target error checking */
    if (!legal_teximage_target(ctx, dims, target)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s%uD(target=%s)",
-                  func, dims, _mesa_lookup_enum_by_nr(target));
+                  func, dims, _mesa_enum_to_string(target));
       return;
    }
 
@@ -3313,16 +3336,16 @@ teximage(struct gl_context *ctx, GLboolean compressed, GLuint dims,
 
       if (!dimensionsOK) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glTexImage%uD(invalid width or height or depth)",
-                     dims);
+                     "%s%uD(invalid width or height or depth)",
+                     func, dims);
          return;
       }
 
       if (!sizeOK) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY,
-                     "glTexImage%uD(image too large: %d x %d x %d, %s format)",
-                     dims, width, height, depth,
-                     _mesa_lookup_enum_by_nr(internalFormat));
+                     "%s%uD(image too large: %d x %d x %d, %s format)",
+                     func, dims, width, height, depth,
+                     _mesa_enum_to_string(internalFormat));
          return;
       }
 
@@ -3495,7 +3518,6 @@ _mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image)
       _mesa_dirty_texobj(ctx, texObj);
    }
    _mesa_unlock_texture(ctx, texObj);
-
 }
 
 
@@ -3515,14 +3537,6 @@ _mesa_texture_sub_image(struct gl_context *ctx, GLuint dims,
 {
    FLUSH_VERTICES(ctx, 0);
 
-   /* check target (proxies not allowed) */
-   if (!legal_texsubimage_target(ctx, dims, target, dsa)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sSubImage%uD(target=%s)",
-                  dsa ? "ture" : "",
-                  dims, _mesa_lookup_enum_by_nr(target));
-      return;
-   }
-
    if (ctx->NewState & _NEW_PIXEL)
       _mesa_update_state(ctx);
 
@@ -3572,6 +3586,13 @@ texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
 
+   /* check target (proxies not allowed) */
+   if (!legal_texsubimage_target(ctx, dims, target, false)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glTexSubImage%uD(target=%s)",
+                  dims, _mesa_enum_to_string(target));
+      return;
+   }
+
    texObj = _mesa_get_current_tex_object(ctx, target);
    if (!texObj)
       return;
@@ -3589,10 +3610,10 @@ texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexSubImage%uD %s %d %d %d %d %d %d %d %s %s %p\n",
                   dims,
-                  _mesa_lookup_enum_by_nr(target), level,
+                  _mesa_enum_to_string(target), level,
                   xoffset, yoffset, zoffset, width, height, depth,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type), pixels);
 
    _mesa_texture_sub_image(ctx, dims, texObj, texImage, target, level,
                            xoffset, yoffset, zoffset, width, height, depth,
@@ -3621,8 +3642,8 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
                   "glTextureSubImage%uD %d %d %d %d %d %d %d %d %s %s %p\n",
                   dims, texture, level,
                   xoffset, yoffset, zoffset, width, height, depth,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type), pixels);
 
    /* Get the texture object by Name. */
    texObj = _mesa_lookup_texture(ctx, texture);
@@ -3632,6 +3653,13 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
       return;
    }
 
+   /* check target (proxies not allowed) */
+   if (!legal_texsubimage_target(ctx, dims, texObj->Target, true)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(target=%s)",
+                  callerName, _mesa_enum_to_string(texObj->Target));
+      return;
+   }
+
    if (texsubimage_error_check(ctx, dims, texObj, texObj->Target, level,
                                xoffset, yoffset, zoffset,
                                width, height, depth, format, type,
@@ -3842,8 +3870,7 @@ copytexsubimage_by_slice(struct gl_context *ctx,
 }
 
 static GLboolean
-formats_differ_in_component_sizes (mesa_format f1,
-                                   mesa_format f2)
+formats_differ_in_component_sizes(mesa_format f1, mesa_format f2)
 {
    GLint f1_r_bits = _mesa_get_format_bits(f1, GL_RED_BITS);
    GLint f1_g_bits = _mesa_get_format_bits(f1, GL_GREEN_BITS);
@@ -3883,8 +3910,8 @@ copyteximage(struct gl_context *ctx, GLuint dims,
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glCopyTexImage%uD %s %d %s %d %d %d %d %d\n",
                   dims,
-                  _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat),
+                  _mesa_enum_to_string(target), level,
+                  _mesa_enum_to_string(internalFormat),
                   x, y, width, height, border);
 
    if (ctx->NewState & NEW_COPY_TEX_STATE)
@@ -3916,8 +3943,8 @@ copyteximage(struct gl_context *ctx, GLuint dims,
        */
          if (rb->InternalFormat == GL_RGB10_A2) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glCopyTexImage%uD(Reading from GL_RGB10_A2 buffer and"
-                           " writing to unsized internal format)", dims);
+                           "glCopyTexImage%uD(Reading from GL_RGB10_A2 buffer"
+                           " and writing to unsized internal format)", dims);
                return;
          }
       }
@@ -4043,7 +4070,7 @@ _mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims,
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "%s %s %d %d %d %d %d %d %d %d\n", caller,
-                  _mesa_lookup_enum_by_nr(target),
+                  _mesa_enum_to_string(target),
                   level, xoffset, yoffset, zoffset, x, y, width, height);
 
    if (ctx->NewState & NEW_COPY_TEX_STATE)
@@ -4105,7 +4132,7 @@ _mesa_CopyTexSubImage1D( GLenum target, GLint level,
     */
    if (!legal_texsubimage_target(ctx, 1, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4133,7 +4160,7 @@ _mesa_CopyTexSubImage2D( GLenum target, GLint level,
     */
    if (!legal_texsubimage_target(ctx, 2, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4162,7 +4189,7 @@ _mesa_CopyTexSubImage3D( GLenum target, GLint level,
     */
    if (!legal_texsubimage_target(ctx, 3, target, false)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
@@ -4190,7 +4217,7 @@ _mesa_CopyTextureSubImage1D(GLuint texture, GLint level,
    /* Check target (proxies not allowed). */
    if (!legal_texsubimage_target(ctx, 1, texObj->Target, true)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
@@ -4214,7 +4241,7 @@ _mesa_CopyTextureSubImage2D(GLuint texture, GLint level,
    /* Check target (proxies not allowed). */
    if (!legal_texsubimage_target(ctx, 2, texObj->Target, true)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
@@ -4241,7 +4268,7 @@ _mesa_CopyTextureSubImage3D(GLuint texture, GLint level,
    /* Check target (proxies not allowed). */
    if (!legal_texsubimage_target(ctx, 3, texObj->Target, true)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", self,
-                  _mesa_lookup_enum_by_nr(texObj->Target));
+                  _mesa_enum_to_string(texObj->Target));
       return;
    }
 
@@ -4288,8 +4315,8 @@ check_clear_tex_image(struct gl_context *ctx,
       _mesa_error(ctx, err,
                   "%s(incompatible format = %s, type = %s)",
                   function,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type));
+                  _mesa_enum_to_string(format),
+                  _mesa_enum_to_string(type));
       return false;
    }
 
@@ -4298,8 +4325,8 @@ check_clear_tex_image(struct gl_context *ctx,
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(incompatible internalFormat = %s, format = %s)",
                   function,
-                  _mesa_lookup_enum_by_nr(internalFormat),
-                  _mesa_lookup_enum_by_nr(format));
+                  _mesa_enum_to_string(internalFormat),
+                  _mesa_enum_to_string(format));
       return false;
    }
 
@@ -4541,7 +4568,7 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
 
    if (dsa && target == GL_TEXTURE_RECTANGLE) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid target %s)", caller,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
@@ -4549,13 +4576,15 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
    case 2:
       switch (target) {
       case GL_TEXTURE_2D:
+         targetOK = GL_TRUE;
+         break;
       case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
       case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
       case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
       case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
       case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
       case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-         targetOK = GL_TRUE;
+         targetOK = ctx->Extensions.ARB_texture_cube_map;
          break;
       default:
          targetOK = GL_FALSE;
@@ -4563,52 +4592,59 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
       }
       break;
    case 3:
-      targetOK = (target == GL_TEXTURE_3D) ||
-                 (target == GL_TEXTURE_2D_ARRAY) ||
-                 (target == GL_TEXTURE_CUBE_MAP_ARRAY) ||
-                 (target == GL_TEXTURE_CUBE_MAP && dsa);
-
-      /* OpenGL 4.5 spec (30.10.2014) says in Section 8.7 Compressed Texture
-       * Images:
-       *    "An INVALID_OPERATION error is generated by
-       *    CompressedTex*SubImage3D if the internal format of the texture is
-       *    one of the EAC, ETC2, or RGTC formats and either border is
-       *    non-zero, or the effective target for the texture is not
-       *    TEXTURE_2D_ARRAY."
-       */
-      if (target != GL_TEXTURE_2D_ARRAY) {
-         bool invalidformat;
+      switch (target) {
+      case GL_TEXTURE_CUBE_MAP:
+         targetOK = dsa && ctx->Extensions.ARB_texture_cube_map;
+         break;
+      case GL_TEXTURE_2D_ARRAY:
+         targetOK = _mesa_is_gles3(ctx) ||
+            (_mesa_is_desktop_gl(ctx) && ctx->Extensions.EXT_texture_array);
+         break;
+      case GL_TEXTURE_CUBE_MAP_ARRAY:
+         targetOK = ctx->Extensions.ARB_texture_cube_map_array;
+         break;
+      case GL_TEXTURE_3D:
+         targetOK = GL_TRUE;
+         /*
+          * OpenGL 4.5 spec (30.10.2014) says in Section 8.7 Compressed Texture
+          * Images:
+          *    "An INVALID_OPERATION error is generated by
+          *    CompressedTex*SubImage3D if the internal format of the texture
+          *    is one of the EAC, ETC2, or RGTC formats and either border is
+          *    non-zero, or the effective target for the texture is not
+          *    TEXTURE_2D_ARRAY."
+          *
+          * NOTE: that's probably a spec error.  It should probably say
+          *    "... or the effective target for the texture is not
+          *    TEXTURE_2D_ARRAY, TEXTURE_CUBE_MAP, or
+          *    TEXTURE_CUBE_MAP_ARRAY."
+          * since those targets are 2D images and they support all compression
+          * formats.
+          *
+          * Instead of listing every disallowed format, list only those which
+          * are allowed for TEXTURE_3D, which is (at this time) only BPTC.
+          * Otherwise we'd treat S3TC (and more) as valid here; they are not,
+          * even though the core spec does not mention them.
+          */
          switch (format) {
-            /* These came from _mesa_is_compressed_format in glformats.c. */
-            /* EAC formats */
-            case GL_COMPRESSED_RGBA8_ETC2_EAC:
-            case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
-            case GL_COMPRESSED_R11_EAC:
-            case GL_COMPRESSED_RG11_EAC:
-            case GL_COMPRESSED_SIGNED_R11_EAC:
-            case GL_COMPRESSED_SIGNED_RG11_EAC:
-            /* ETC2 formats */
-            case GL_COMPRESSED_RGB8_ETC2:
-            case GL_COMPRESSED_SRGB8_ETC2:
-            case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-            case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-            /* RGTC formats */
-            case GL_COMPRESSED_RED_RGTC1:
-            case GL_COMPRESSED_SIGNED_RED_RGTC1:
-            case GL_COMPRESSED_RG_RGTC2:
-            case GL_COMPRESSED_SIGNED_RG_RGTC2:
-               invalidformat = true;
-               break;
-            default:
-               invalidformat = false;
-         }
-         if (invalidformat) {
+         /* These are the only 3D compression formats supported at this time */
+         case GL_COMPRESSED_RGBA_BPTC_UNORM:
+         case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
+         case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
+         case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
+            /* valid format */
+            break;
+         default:
+            /* invalid format */
             _mesa_error(ctx, GL_INVALID_OPERATION,
                         "%s(invalid target %s for format %s)", caller,
-                        _mesa_lookup_enum_by_nr(target),
-                        _mesa_lookup_enum_by_nr(format));
+                        _mesa_enum_to_string(target),
+                        _mesa_enum_to_string(format));
             return GL_TRUE;
          }
+         break;
+      default:
+         targetOK = GL_FALSE;
       }
 
       break;
@@ -4621,7 +4657,7 @@ compressed_subtexture_target_check(struct gl_context *ctx, GLenum target,
 
    if (!targetOK) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return GL_TRUE;
    }
 
@@ -4834,8 +4870,7 @@ _mesa_CompressedTextureSubImage1D(GLuint texture, GLint level, GLint xoffset,
    if (!texObj)
       return;
 
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 1, format,
-                                          true,
+   if (compressed_subtexture_target_check(ctx, texObj->Target, 1, format, true,
                                           "glCompressedTextureSubImage1D")) {
       return;
    }
@@ -4912,8 +4947,7 @@ _mesa_CompressedTextureSubImage2D(GLuint texture, GLint level, GLint xoffset,
    if (!texObj)
       return;
 
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 2, format,
-                                          true,
+   if (compressed_subtexture_target_check(ctx, texObj->Target, 2, format, true,
                                           "glCompressedTextureSubImage2D")) {
       return;
    }
@@ -4990,8 +5024,7 @@ _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
    if (!texObj)
       return;
 
-   if (compressed_subtexture_target_check(ctx, texObj->Target, 3, format,
-                                          true,
+   if (compressed_subtexture_target_check(ctx, texObj->Target, 3, format, true,
                                           "glCompressedTextureSubImage3D")) {
       return;
    }
@@ -5440,7 +5473,6 @@ _mesa_TexBufferRange(GLenum target, GLenum internalFormat, GLuint buffer,
          return;
 
    } else {
-
       /* OpenGL 4.5 core spec (02.02.2015) says in Section 8.9 Buffer
        * Textures (PDF page 254):
        *    "If buffer is zero, then any buffer object attached to the buffer
@@ -5508,7 +5540,6 @@ _mesa_TextureBufferRange(GLuint texture, GLenum internalFormat, GLuint buffer,
          return;
 
    } else {
-
       /* OpenGL 4.5 core spec (02.02.2015) says in Section 8.9 Buffer
        * Textures (PDF page 254):
        *    "If buffer is zero, then any buffer object attached to the buffer
@@ -5554,19 +5585,17 @@ check_multisample_target(GLuint dims, GLenum target, bool dsa)
       return dims == 2;
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
       return dims == 2 && !dsa;
-
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return dims == 3;
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return dims == 3 && !dsa;
-
    default:
       return GL_FALSE;
    }
 }
 
 
-void
+static void
 _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
                                 struct gl_texture_object *texObj,
                                 GLenum target, GLsizei samples,
@@ -5581,8 +5610,8 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
    GLenum sample_count_error;
    bool dsa = strstr(func, "ture") ? true : false;
 
-   if (!(ctx->Extensions.ARB_texture_multisample
-      && _mesa_is_desktop_gl(ctx))) {
+   if (!((ctx->Extensions.ARB_texture_multisample
+         && _mesa_is_desktop_gl(ctx))) && !_mesa_is_gles31(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unsupported)", func);
       return;
    }
@@ -5605,14 +5634,21 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
    if (immutable && !_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
             "%s(internalformat=%s not legal for immutable-format)",
-            func, _mesa_lookup_enum_by_nr(internalformat));
+            func, _mesa_enum_to_string(internalformat));
       return;
    }
 
    if (!is_renderable_texture_format(ctx, internalformat)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-            "%s(internalformat=%s)",
-            func, _mesa_lookup_enum_by_nr(internalformat));
+      /* Page 172 of OpenGL ES 3.1 spec says:
+       *   "An INVALID_ENUM error is generated if sizedinternalformat is not
+       *   color-renderable, depth-renderable, or stencil-renderable (as
+       *   defined in section 9.4)."
+       *
+       *  (Same error is also defined for desktop OpenGL for multisample
+       *  teximage/texstorage functions.)
+       */
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(internalformat=%s)", func,
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 
@@ -5671,13 +5707,12 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
    else {
       if (!dimensionsOK) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-               "%s(invalid width or height)", func);
+                     "%s(invalid width or height)", func);
          return;
       }
 
       if (!sizeOK) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY,
-               "%s(texture too large)", func);
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s(texture too large)", func);
          return;
       }
 
@@ -5695,7 +5730,7 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
 
       if (width > 0 && height > 0 && depth > 0) {
          if (!ctx->Driver.AllocTextureStorage(ctx, texObj, 1,
-                  width, height, depth)) {
+                                              width, height, depth)) {
             /* tidy up the texture image state. strictly speaking,
              * we're allowed to just leave this in whatever state we
              * like, but being tidy is good.
diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h
index 1eebaa8b631..bf729daf534 100644
--- a/src/mesa/main/teximage.h
+++ b/src/mesa/main/teximage.h
@@ -199,15 +199,6 @@ _mesa_copy_texture_sub_image(struct gl_context *ctx, GLuint dims,
                              GLsizei width, GLsizei height,
                              const char *caller);
 
-extern void
-_mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
-                                struct gl_texture_object *texObj,
-                                GLenum target, GLsizei samples,
-                                GLint internalformat, GLsizei width,
-                                GLsizei height, GLsizei depth,
-                                GLboolean fixedsamplelocations,
-                                GLboolean immutable, const char *func);
-
 extern void
 _mesa_texture_buffer_range(struct gl_context *ctx,
                            struct gl_texture_object *texObj,
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index c563f1e7434..cd7cfd6a4fb 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -1255,7 +1255,7 @@ create_textures(struct gl_context *ctx, GLenum target,
          if (targetIndex < 0) { /* Bad Target */
             mtx_unlock(&ctx->Shared->Mutex);
             _mesa_error(ctx, GL_INVALID_ENUM, "gl%sTextures(target = %s)",
-                        func, _mesa_lookup_enum_by_nr(texObj->Target));
+                        func, _mesa_enum_to_string(texObj->Target));
             return;
          }
          assert(targetIndex < NUM_TEXTURE_TARGETS);
@@ -1606,8 +1606,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target)
       return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array
          ? TEXTURE_CUBE_ARRAY_INDEX : -1;
    case GL_TEXTURE_2D_MULTISAMPLE:
-      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample
-         ? TEXTURE_2D_MULTISAMPLE_INDEX: -1;
+      return ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample) ||
+              _mesa_is_gles31(ctx)) ? TEXTURE_2D_MULTISAMPLE_INDEX: -1;
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample
          ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: -1;
@@ -1642,7 +1642,7 @@ _mesa_BindTexture( GLenum target, GLuint texName )
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glBindTexture %s %d\n",
-                  _mesa_lookup_enum_by_nr(target), (GLint) texName);
+                  _mesa_enum_to_string(target), (GLint) texName);
 
    targetIndex = _mesa_tex_target_to_index(ctx, target);
    if (targetIndex < 0) {
@@ -1806,7 +1806,7 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture)
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glBindTextureUnit %s %d\n",
-                  _mesa_lookup_enum_by_nr(GL_TEXTURE0+unit), (GLint) texture);
+                  _mesa_enum_to_string(GL_TEXTURE0+unit), (GLint) texture);
 
    /* Section 8.1 (Texture Objects) of the OpenGL 4.5 core profile spec
     * (20141030) says:
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index d74134f41b1..c0611c3e489 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -381,7 +381,7 @@ set_tex_parameteri(struct gl_context *ctx,
       if (texObj->Target == GL_TEXTURE_RECTANGLE_ARB && params[0] != 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glTex%sParameter(target=%s, param=%d)", suffix,
-                     _mesa_lookup_enum_by_nr(texObj->Target), params[0]);
+                     _mesa_enum_to_string(texObj->Target), params[0]);
          return GL_FALSE;
       }
       incomplete(ctx, texObj);
@@ -500,7 +500,9 @@ set_tex_parameteri(struct gl_context *ctx,
       goto invalid_pname;
 
    case GL_DEPTH_STENCIL_TEXTURE_MODE:
-      if (_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_stencil_texturing) {
+      if ((_mesa_is_desktop_gl(ctx) &&
+           ctx->Extensions.ARB_stencil_texturing) ||
+          _mesa_is_gles31(ctx)) {
          bool stencil = params[0] == GL_STENCIL_INDEX;
          if (!stencil && params[0] != GL_DEPTH_COMPONENT)
             goto invalid_param;
@@ -610,22 +612,22 @@ set_tex_parameteri(struct gl_context *ctx,
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 
 invalid_param:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(param=%s)",
-               suffix, _mesa_lookup_enum_by_nr(params[0]));
+               suffix, _mesa_enum_to_string(params[0]));
    return GL_FALSE;
 
 invalid_operation:
    _mesa_error(ctx, GL_INVALID_OPERATION, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 }
 
@@ -683,7 +685,7 @@ set_tex_parameterf(struct gl_context *ctx,
 
          if (texObj->Sampler.MaxAnisotropy == params[0])
             return GL_FALSE;
-         if (params[0] < 1.0) {
+         if (params[0] < 1.0F) {
             _mesa_error(ctx, GL_INVALID_VALUE, "glTex%sParameter(param)",
                         suffix);
             return GL_FALSE;
@@ -745,12 +747,12 @@ set_tex_parameterf(struct gl_context *ctx,
 
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 
 invalid_enum:
    _mesa_error(ctx, GL_INVALID_ENUM, "glTex%sParameter(pname=%s)",
-               suffix, _mesa_lookup_enum_by_nr(pname));
+               suffix, _mesa_enum_to_string(pname));
    return GL_FALSE;
 }
 
@@ -1395,7 +1397,7 @@ get_tex_level_parameter_image(struct gl_context *ctx,
     else {
        _mesa_error(ctx, GL_INVALID_OPERATION,
                    "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-                   _mesa_lookup_enum_by_nr(pname));
+                   _mesa_enum_to_string(pname));
     }
          break;
       case GL_TEXTURE_COMPRESSED:
@@ -1444,7 +1446,7 @@ get_tex_level_parameter_image(struct gl_context *ctx,
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM,
                "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1528,7 +1530,7 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
          /* Always illegal for GL_TEXTURE_BUFFER */
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-                     _mesa_lookup_enum_by_nr(pname));
+                     _mesa_enum_to_string(pname));
          break;
 
       /* GL_ARB_texture_float */
@@ -1557,7 +1559,7 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
 invalid_pname:
    _mesa_error(ctx, GL_INVALID_ENUM,
                "glGetTex%sLevelParameter[if]v(pname=%s)", suffix,
-               _mesa_lookup_enum_by_nr(pname));
+               _mesa_enum_to_string(pname));
 }
 
 
@@ -1586,7 +1588,7 @@ get_tex_level_parameteriv(struct gl_context *ctx,
    if (!legal_get_tex_level_parameter_target(ctx, target, dsa)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glGetTex%sLevelParameter[if]v(target=%s)", suffix,
-                  _mesa_lookup_enum_by_nr(target));
+                  _mesa_enum_to_string(target));
       return;
    }
 
diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index 1af9d47f030..9b5928c4306 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -123,21 +123,21 @@ _mesa_print_texunit_state( struct gl_context *ctx, GLuint unit )
 {
    const struct gl_texture_unit *texUnit = ctx->Texture.Unit + unit;
    printf("Texture Unit %d\n", unit);
-   printf("  GL_TEXTURE_ENV_MODE = %s\n", _mesa_lookup_enum_by_nr(texUnit->EnvMode));
-   printf("  GL_COMBINE_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.ModeRGB));
-   printf("  GL_COMBINE_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.ModeA));
-   printf("  GL_SOURCE0_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[0]));
-   printf("  GL_SOURCE1_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[1]));
-   printf("  GL_SOURCE2_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceRGB[2]));
-   printf("  GL_SOURCE0_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[0]));
-   printf("  GL_SOURCE1_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[1]));
-   printf("  GL_SOURCE2_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.SourceA[2]));
-   printf("  GL_OPERAND0_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[0]));
-   printf("  GL_OPERAND1_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[1]));
-   printf("  GL_OPERAND2_RGB = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandRGB[2]));
-   printf("  GL_OPERAND0_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[0]));
-   printf("  GL_OPERAND1_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[1]));
-   printf("  GL_OPERAND2_ALPHA = %s\n", _mesa_lookup_enum_by_nr(texUnit->Combine.OperandA[2]));
+   printf("  GL_TEXTURE_ENV_MODE = %s\n", _mesa_enum_to_string(texUnit->EnvMode));
+   printf("  GL_COMBINE_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.ModeRGB));
+   printf("  GL_COMBINE_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.ModeA));
+   printf("  GL_SOURCE0_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[0]));
+   printf("  GL_SOURCE1_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[1]));
+   printf("  GL_SOURCE2_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceRGB[2]));
+   printf("  GL_SOURCE0_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[0]));
+   printf("  GL_SOURCE1_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[1]));
+   printf("  GL_SOURCE2_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.SourceA[2]));
+   printf("  GL_OPERAND0_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[0]));
+   printf("  GL_OPERAND1_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[1]));
+   printf("  GL_OPERAND2_RGB = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandRGB[2]));
+   printf("  GL_OPERAND0_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[0]));
+   printf("  GL_OPERAND1_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[1]));
+   printf("  GL_OPERAND2_ALPHA = %s\n", _mesa_enum_to_string(texUnit->Combine.OperandA[2]));
    printf("  GL_RGB_SCALE = %d\n", 1 << texUnit->Combine.ScaleShiftRGB);
    printf("  GL_ALPHA_SCALE = %d\n", 1 << texUnit->Combine.ScaleShiftA);
    printf("  GL_TEXTURE_ENV_COLOR = (%f, %f, %f, %f)\n", texUnit->EnvColor[0], texUnit->EnvColor[1], texUnit->EnvColor[2], texUnit->EnvColor[3]);
@@ -289,23 +289,23 @@ _mesa_ActiveTexture(GLenum texture)
    GLuint k;
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
+      _mesa_debug(ctx, "glActiveTexture %s\n",
+                  _mesa_enum_to_string(texture));
+
+   if (ctx->Texture.CurrentUnit == texUnit)
+      return;
+
    k = _mesa_max_tex_unit(ctx);
 
    assert(k <= ARRAY_SIZE(ctx->Texture.Unit));
 
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glActiveTexture %s\n",
-                  _mesa_lookup_enum_by_nr(texture));
-
    if (texUnit >= k) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glActiveTexture(texture=%s)",
-                  _mesa_lookup_enum_by_nr(texture));
+                  _mesa_enum_to_string(texture));
       return;
    }
 
-   if (ctx->Texture.CurrentUnit == texUnit)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_TEXTURE);
 
    ctx->Texture.CurrentUnit = texUnit;
@@ -325,16 +325,16 @@ _mesa_ClientActiveTexture(GLenum texture)
 
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glClientActiveTexture %s\n",
-                  _mesa_lookup_enum_by_nr(texture));
+                  _mesa_enum_to_string(texture));
+
+   if (ctx->Array.ActiveTexture == texUnit)
+      return;
 
    if (texUnit >= ctx->Const.MaxTextureCoordUnits) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture)");
       return;
    }
 
-   if (ctx->Array.ActiveTexture == texUnit)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_ARRAY);
    ctx->Array.ActiveTexture = texUnit;
 }
diff --git a/src/mesa/main/texstate.h b/src/mesa/main/texstate.h
index 662435b47cc..bee8c9c3316 100644
--- a/src/mesa/main/texstate.h
+++ b/src/mesa/main/texstate.h
@@ -77,7 +77,7 @@ _mesa_get_tex_unit_err(struct gl_context *ctx, GLuint unit, const char *func)
     *     implementation."
     */
    _mesa_error(ctx, GL_INVALID_OPERATION, "%s(unit=%s)", func,
-               _mesa_lookup_enum_by_nr(GL_TEXTURE0+unit));
+               _mesa_enum_to_string(GL_TEXTURE0+unit));
    return NULL;
 }
 
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index 53cb2c091f8..4a2cc6065df 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -308,7 +308,8 @@ tex_storage_error_check(struct gl_context *ctx,
       _mesa_error(ctx, _mesa_is_desktop_gl(ctx)?
                   GL_INVALID_ENUM : GL_INVALID_OPERATION,
                   "glTex%sStorage%dD(internalformat = %s)", suffix, dims,
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
+      return GL_TRUE;
    }
 
    /* levels check */
@@ -464,21 +465,21 @@ texstorage(GLuint dims, GLenum target, GLsizei levels, GLenum internalformat,
    if (!legal_texobj_target(ctx, dims, target)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTexStorage%uD(illegal target=%s)",
-                  dims, _mesa_lookup_enum_by_nr(target));
+                  dims, _mesa_enum_to_string(target));
       return;
    }
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTexStorage%uD %s %d %s %d %d %d\n",
                   dims,
-                  _mesa_lookup_enum_by_nr(target), levels,
-                  _mesa_lookup_enum_by_nr(internalformat),
+                  _mesa_enum_to_string(target), levels,
+                  _mesa_enum_to_string(internalformat),
                   width, height, depth);
    /* Check the format to make sure it is sized. */
    if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTexStorage%uD(internalformat = %s)", dims,
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 
@@ -504,14 +505,14 @@ texturestorage(GLuint dims, GLuint texture, GLsizei levels,
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTextureStorage%uD %d %d %s %d %d %d\n",
                   dims, texture, levels,
-                  _mesa_lookup_enum_by_nr(internalformat),
+                  _mesa_enum_to_string(internalformat),
                   width, height, depth);
 
    /* Check the format to make sure it is sized. */
    if (!_mesa_is_legal_tex_storage_format(ctx, internalformat)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTextureStorage%uD(internalformat = %s)", dims,
-                  _mesa_lookup_enum_by_nr(internalformat));
+                  _mesa_enum_to_string(internalformat));
       return;
    }
 
@@ -529,7 +530,7 @@ texturestorage(GLuint dims, GLuint texture, GLsizei levels,
    if (!legal_texobj_target(ctx, dims, texObj->Target)) {
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "glTextureStorage%uD(illegal target=%s)",
-                  dims, _mesa_lookup_enum_by_nr(texObj->Target));
+                  dims, _mesa_enum_to_string(texObj->Target));
       return;
    }
 
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 1525205981b..37c05690091 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -787,6 +787,7 @@ texstore_rgba(TEXSTORE_PARAMS)
       srcType = GL_FLOAT;
       srcRowStride = srcWidth * 4 * sizeof(float);
       srcMesaFormat = RGBA32_FLOAT;
+      srcPacking = &ctx->DefaultPacking;
    }
 
    src = (GLubyte *)
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index 6b0aed4ea1a..5a3282a40c1 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -313,7 +313,7 @@ target_valid(struct gl_context *ctx, GLenum origTarget, GLenum newTarget)
    }
    _mesa_error(ctx, GL_INVALID_OPERATION,
                "glTextureView(illegal target=%s)",
-               _mesa_lookup_enum_by_nr(newTarget));
+               _mesa_enum_to_string(newTarget));
    return false;
 }
 #undef RETURN_IF_SUPPORTED
@@ -435,8 +435,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
 
    if (MESA_VERBOSE & (VERBOSE_API | VERBOSE_TEXTURE))
       _mesa_debug(ctx, "glTextureView %d %s %d %s %d %d %d %d\n",
-                  texture, _mesa_lookup_enum_by_nr(target), origtexture,
-                  _mesa_lookup_enum_by_nr(internalformat),
+                  texture, _mesa_enum_to_string(target), origtexture,
+                  _mesa_enum_to_string(internalformat),
                   minlevel, numlevels, minlayer, numlayers);
 
    if (origtexture == 0) {
@@ -523,8 +523,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
                                    internalformat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
           "glTextureView(internalformat %s not compatible with origtexture %s)",
-          _mesa_lookup_enum_by_nr(internalformat),
-          _mesa_lookup_enum_by_nr(origTexObj->Image[0][0]->InternalFormat));
+          _mesa_enum_to_string(internalformat),
+          _mesa_enum_to_string(origTexObj->Image[0][0]->InternalFormat));
       return;
    }
 
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index cab5083e81b..036530e91b6 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -978,81 +978,6 @@ _mesa_uniform_matrix(struct gl_context *ctx, struct gl_shader_program *shProg,
 }
 
 
-/**
- * Called via glGetUniformLocation().
- *
- * Returns the uniform index into UniformStorage (also the
- * glGetActiveUniformsiv uniform index), and stores the referenced
- * array offset in *offset, or GL_INVALID_INDEX (-1).
- */
-extern "C" unsigned
-_mesa_get_uniform_location(struct gl_shader_program *shProg,
-                           const GLchar *name,
-                           unsigned *out_offset)
-{
-   /* Page 80 (page 94 of the PDF) of the OpenGL 2.1 spec says:
-    *
-    *     "The first element of a uniform array is identified using the
-    *     name of the uniform array appended with "[0]". Except if the last
-    *     part of the string name indicates a uniform array, then the
-    *     location of the first element of that array can be retrieved by
-    *     either using the name of the uniform array, or the name of the
-    *     uniform array appended with "[0]"."
-    *
-    * Note: since uniform names are not allowed to use whitespace, and array
-    * indices within uniform names are not allowed to use "+", "-", or leading
-    * zeros, it follows that each uniform has a unique name up to the possible
-    * ambiguity with "[0]" noted above.  Therefore we don't need to worry
-    * about mal-formed inputs--they will properly fail when we try to look up
-    * the uniform name in shProg->UniformHash.
-    */
-
-   const GLchar *base_name_end;
-   long offset = parse_program_resource_name(name, &base_name_end);
-   bool array_lookup = offset >= 0;
-   char *name_copy;
-
-   if (array_lookup) {
-      name_copy = (char *) malloc(base_name_end - name + 1);
-      memcpy(name_copy, name, base_name_end - name);
-      name_copy[base_name_end - name] = '\0';
-   } else {
-      name_copy = (char *) name;
-      offset = 0;
-   }
-
-   unsigned location = 0;
-   const bool found = shProg->UniformHash->get(location, name_copy);
-
-   assert(!found
-	  || strcmp(name_copy, shProg->UniformStorage[location].name) == 0);
-
-   /* Free the temporary buffer *before* possibly returning an error.
-    */
-   if (name_copy != name)
-      free(name_copy);
-
-   if (!found)
-      return GL_INVALID_INDEX;
-
-   /* If the uniform is built-in, fail. */
-   if (shProg->UniformStorage[location].builtin)
-      return GL_INVALID_INDEX;
-
-   /* If the uniform is an array, fail if the index is out of bounds.
-    * (A negative index is caught above.)  This also fails if the uniform
-    * is not an array, but the user is trying to index it, because
-    * array_elements is zero and offset >= 0.
-    */
-   if (array_lookup
-       && offset >= (long) shProg->UniformStorage[location].array_elements) {
-      return GL_INVALID_INDEX;
-   }
-
-   *out_offset = offset;
-   return location;
-}
-
 extern "C" bool
 _mesa_sampler_uniforms_are_valid(const struct gl_shader_program *shProg,
 				 char *errMsg, size_t errMsgLength)
@@ -1101,18 +1026,23 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline)
       for (unsigned i = 0; i < shProg[idx]->NumUniformStorage; i++) {
          const struct gl_uniform_storage *const storage =
             &shProg[idx]->UniformStorage[i];
-         const glsl_type *const t = (storage->type->is_array())
-            ? storage->type->fields.array : storage->type;
 
-         if (!t->is_sampler())
+         if (!storage->type->is_sampler())
             continue;
 
          active_samplers++;
 
-         const unsigned count = MAX2(1, storage->type->array_size());
+         const unsigned count = MAX2(1, storage->array_elements);
          for (unsigned j = 0; j < count; j++) {
             const unsigned unit = storage->storage[j].i;
 
+            /* FIXME: Samplers are initialized to 0, and Mesa currently does
+             * not do a great job of eliminating unused uniforms, so for now
+             * don't throw an error if two sampler types both point to unit 0.
+             */
+            if (unit == 0)
+               continue;
+
             /* The types of the samplers associated with a particular texture
              * unit must be an exact match.  Page 74 (page 89 of the PDF) of
              * the OpenGL 3.3 core spec says:
@@ -1122,13 +1052,14 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline)
              *     program object."
              */
             if (unit_types[unit] == NULL) {
-               unit_types[unit] = t;
-            } else if (unit_types[unit] != t) {
+               unit_types[unit] = storage->type;
+            } else if (unit_types[unit] != storage->type) {
                pipeline->InfoLog =
                   ralloc_asprintf(pipeline,
                                   "Texture unit %d is accessed both as %s "
                                   "and %s",
-                                  unit, unit_types[unit]->name, t->name);
+                                  unit, unit_types[unit]->name,
+                                  storage->type->name);
                return false;
             }
          }
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index 5548d1d026f..ff1df72e1d6 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -952,7 +952,7 @@ _mesa_GetUniformBlockIndex(GLuint program,
 
    struct gl_program_resource *res =
       _mesa_program_resource_find_name(shProg, GL_UNIFORM_BLOCK,
-                                       uniformBlockName);
+                                       uniformBlockName, NULL);
    if (!res)
       return GL_INVALID_INDEX;
 
@@ -987,7 +987,8 @@ _mesa_GetUniformIndices(GLuint program,
 
    for (i = 0; i < uniformCount; i++) {
       struct gl_program_resource *res =
-         _mesa_program_resource_find_name(shProg, GL_UNIFORM, uniformNames[i]);
+         _mesa_program_resource_find_name(shProg, GL_UNIFORM, uniformNames[i],
+                                          NULL);
       uniformIndices[i] = _mesa_program_resource_index(shProg, res);
    }
 }
@@ -1092,6 +1093,21 @@ mesa_bufferiv(struct gl_shader_program *shProg, GLenum type,
                                   GL_REFERENCED_BY_VERTEX_SHADER, params,
                                   caller);
       return;
+
+   case GL_UNIFORM_BLOCK_REFERENCED_BY_TESS_CONTROL_SHADER:
+   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_CONTROL_SHADER:
+      _mesa_program_resource_prop(shProg, res, index,
+                                  GL_REFERENCED_BY_TESS_CONTROL_SHADER, params,
+                                  caller);
+      return;
+
+   case GL_UNIFORM_BLOCK_REFERENCED_BY_TESS_EVALUATION_SHADER:
+   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_EVALUATION_SHADER:
+      _mesa_program_resource_prop(shProg, res, index,
+                                  GL_REFERENCED_BY_TESS_EVALUATION_SHADER, params,
+                                  caller);
+      return;
+
    case GL_UNIFORM_BLOCK_REFERENCED_BY_GEOMETRY_SHADER:
    case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_GEOMETRY_SHADER:
       _mesa_program_resource_prop(shProg, res, index,
@@ -1104,16 +1120,10 @@ mesa_bufferiv(struct gl_shader_program *shProg, GLenum type,
                                   GL_REFERENCED_BY_FRAGMENT_SHADER, params,
                                   caller);
       return;
-   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_CONTROL_SHADER:
-      params[0] = GL_FALSE;
-      return;
-   case GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_TESS_EVALUATION_SHADER:
-      params[0] = GL_FALSE;
-      return;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM,
                   "%s(pname 0x%x (%s))", caller, pname,
-                  _mesa_lookup_enum_by_nr(pname));
+                  _mesa_enum_to_string(pname));
       return;
    }
 }
diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h
index bd7b05e207a..e62eaa53ccc 100644
--- a/src/mesa/main/uniforms.h
+++ b/src/mesa/main/uniforms.h
@@ -343,10 +343,6 @@ void GLAPIENTRY
 _mesa_ProgramUniformMatrix4x3dv(GLuint program, GLint location, GLsizei count,
                                 GLboolean transpose, const GLdouble *value);
 
-unsigned
-_mesa_get_uniform_location(struct gl_shader_program *shProg,
-			   const GLchar *name, unsigned *offset);
-
 void
 _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shader_program,
 	      GLint location, GLsizei count,
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index ebdd9eaf02e..3bab9850588 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -300,7 +300,7 @@ update_array_format(struct gl_context *ctx,
    typeBit = type_to_bit(ctx, type);
    if (typeBit == 0x0 || (typeBit & legalTypesMask) == 0x0) {
       _mesa_error(ctx, GL_INVALID_ENUM, "%s(type = %s)",
-                  func, _mesa_lookup_enum_by_nr(type));
+                  func, _mesa_enum_to_string(type));
       return false;
    }
 
@@ -333,7 +333,7 @@ update_array_format(struct gl_context *ctx,
 
       if (bgra_error) {
          _mesa_error(ctx, GL_INVALID_OPERATION, "%s(size=GL_BGRA and type=%s)",
-                     func, _mesa_lookup_enum_by_nr(type));
+                     func, _mesa_enum_to_string(type));
          return false;
       }
 
@@ -2310,7 +2310,7 @@ print_array(const char *name, GLint index, const struct gl_client_array *array)
    else
       fprintf(stderr, "  %s: ", name);
    fprintf(stderr, "Ptr=%p, Type=%s, Size=%d, ElemSize=%u, Stride=%d, Buffer=%u(Size %lu)\n",
-           array->Ptr, _mesa_lookup_enum_by_nr(array->Type), array->Size,
+           array->Ptr, _mesa_enum_to_string(array->Type), array->Size,
            array->_ElementSize, array->StrideB, array->BufferObj->Name,
            (unsigned long) array->BufferObj->Size);
 }
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 8bc00ace5c4..fd7ae53ccbd 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -309,7 +309,7 @@ compute_version(const struct gl_extensions *extensions,
                          extensions->ARB_gpu_shader5 &&
                          extensions->ARB_gpu_shader_fp64 &&
                          extensions->ARB_sample_shading &&
-                         false /*extensions->ARB_shader_subroutine*/ &&
+                         extensions->ARB_shader_subroutine &&
                          extensions->ARB_tessellation_shader &&
                          extensions->ARB_texture_buffer_object_rgb32 &&
                          extensions->ARB_texture_cube_map_array &&
diff --git a/src/mesa/main/viewport.c b/src/mesa/main/viewport.c
index b27063031c4..7d8914291c3 100644
--- a/src/mesa/main/viewport.c
+++ b/src/mesa/main/viewport.c
@@ -391,8 +391,8 @@ _mesa_ClipControl(GLenum origin, GLenum depth)
 
    if (MESA_VERBOSE&VERBOSE_API)
       _mesa_debug(ctx, "glClipControl(%s, %s)\n",
-	          _mesa_lookup_enum_by_nr(origin),
-                  _mesa_lookup_enum_by_nr(depth));
+	          _mesa_enum_to_string(origin),
+                  _mesa_enum_to_string(depth));
 
    ASSERT_OUTSIDE_BEGIN_END(ctx);
 
@@ -443,12 +443,12 @@ _mesa_ClipControl(GLenum origin, GLenum depth)
  */
 void
 _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
-                         double scale[3], double translate[3])
+                         float scale[3], float translate[3])
 {
-   double x = ctx->ViewportArray[i].X;
-   double y = ctx->ViewportArray[i].Y;
-   double half_width = 0.5*ctx->ViewportArray[i].Width;
-   double half_height = 0.5*ctx->ViewportArray[i].Height;
+   float x = ctx->ViewportArray[i].X;
+   float y = ctx->ViewportArray[i].Y;
+   float half_width = 0.5f * ctx->ViewportArray[i].Width;
+   float half_height = 0.5f * ctx->ViewportArray[i].Height;
    double n = ctx->ViewportArray[i].Near;
    double f = ctx->ViewportArray[i].Far;
 
@@ -462,8 +462,8 @@ _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
       translate[1] = half_height + y;
    }
    if (ctx->Transform.ClipDepthMode == GL_NEGATIVE_ONE_TO_ONE) {
-      scale[2] = 0.5*(f - n);
-      translate[2] = 0.5*(n + f);
+      scale[2] = 0.5 * (f - n);
+      translate[2] = 0.5 * (n + f);
    } else {
       scale[2] = f - n;
       translate[2] = n;
diff --git a/src/mesa/main/viewport.h b/src/mesa/main/viewport.h
index 899dc2d0bcc..b0675db1096 100644
--- a/src/mesa/main/viewport.h
+++ b/src/mesa/main/viewport.h
@@ -73,6 +73,6 @@ _mesa_ClipControl(GLenum origin, GLenum depth);
 
 extern void
 _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
-                         double scale[3], double translate[3]);
+                         float scale[3], float translate[3]);
 
 #endif
diff --git a/src/mesa/math/m_clip_tmp.h b/src/mesa/math/m_clip_tmp.h
index e289be7b302..60c00043725 100644
--- a/src/mesa/math/m_clip_tmp.h
+++ b/src/mesa/math/m_clip_tmp.h
@@ -194,13 +194,13 @@ static GLvector4f * TAG(cliptest_points3)( GLvector4f *clip_vec,
    STRIDE_LOOP {
       const GLfloat cx = from[0], cy = from[1], cz = from[2];
       GLubyte mask = 0;
-      if (cx >  1.0)       mask |= CLIP_RIGHT_BIT;
-      else if (cx < -1.0)  mask |= CLIP_LEFT_BIT;
-      if (cy >  1.0)       mask |= CLIP_TOP_BIT;
-      else if (cy < -1.0)  mask |= CLIP_BOTTOM_BIT;
+      if (cx >  1.0F)       mask |= CLIP_RIGHT_BIT;
+      else if (cx < -1.0F)  mask |= CLIP_LEFT_BIT;
+      if (cy >  1.0F)       mask |= CLIP_TOP_BIT;
+      else if (cy < -1.0F)  mask |= CLIP_BOTTOM_BIT;
       if (viewport_z_clip) {
-	 if (cz >  1.0)       mask |= CLIP_FAR_BIT;
-	 else if (cz < -1.0)  mask |= CLIP_NEAR_BIT;
+	 if (cz >  1.0F)       mask |= CLIP_FAR_BIT;
+	 else if (cz < -1.0F)  mask |= CLIP_NEAR_BIT;
       }
       clipMask[i] = mask;
       tmpOrMask |= mask;
@@ -230,10 +230,10 @@ static GLvector4f * TAG(cliptest_points2)( GLvector4f *clip_vec,
    STRIDE_LOOP {
       const GLfloat cx = from[0], cy = from[1];
       GLubyte mask = 0;
-      if (cx >  1.0)       mask |= CLIP_RIGHT_BIT;
-      else if (cx < -1.0)  mask |= CLIP_LEFT_BIT;
-      if (cy >  1.0)       mask |= CLIP_TOP_BIT;
-      else if (cy < -1.0)  mask |= CLIP_BOTTOM_BIT;
+      if (cx >  1.0F)       mask |= CLIP_RIGHT_BIT;
+      else if (cx < -1.0F)  mask |= CLIP_LEFT_BIT;
+      if (cy >  1.0F)       mask |= CLIP_TOP_BIT;
+      else if (cy < -1.0F)  mask |= CLIP_BOTTOM_BIT;
       clipMask[i] = mask;
       tmpOrMask |= mask;
       tmpAndMask &= mask;
diff --git a/src/mesa/math/m_matrix.c b/src/mesa/math/m_matrix.c
index ecf564c0089..6522200b345 100644
--- a/src/mesa/math/m_matrix.c
+++ b/src/mesa/math/m_matrix.c
@@ -380,7 +380,7 @@ static GLboolean invert_matrix_general( GLmatrix *mat )
    if (fabsf(r3[0])>fabsf(r2[0])) SWAP_ROWS(r3, r2);
    if (fabsf(r2[0])>fabsf(r1[0])) SWAP_ROWS(r2, r1);
    if (fabsf(r1[0])>fabsf(r0[0])) SWAP_ROWS(r1, r0);
-   if (0.0 == r0[0])  return GL_FALSE;
+   if (0.0F == r0[0])  return GL_FALSE;
 
    /* eliminate first variable     */
    m1 = r1[0]/r0[0]; m2 = r2[0]/r0[0]; m3 = r3[0]/r0[0];
@@ -388,31 +388,31 @@ static GLboolean invert_matrix_general( GLmatrix *mat )
    s = r0[2]; r1[2] -= m1 * s; r2[2] -= m2 * s; r3[2] -= m3 * s;
    s = r0[3]; r1[3] -= m1 * s; r2[3] -= m2 * s; r3[3] -= m3 * s;
    s = r0[4];
-   if (s != 0.0) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; }
+   if (s != 0.0F) { r1[4] -= m1 * s; r2[4] -= m2 * s; r3[4] -= m3 * s; }
    s = r0[5];
-   if (s != 0.0) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; }
+   if (s != 0.0F) { r1[5] -= m1 * s; r2[5] -= m2 * s; r3[5] -= m3 * s; }
    s = r0[6];
-   if (s != 0.0) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; }
+   if (s != 0.0F) { r1[6] -= m1 * s; r2[6] -= m2 * s; r3[6] -= m3 * s; }
    s = r0[7];
-   if (s != 0.0) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; }
+   if (s != 0.0F) { r1[7] -= m1 * s; r2[7] -= m2 * s; r3[7] -= m3 * s; }
 
    /* choose pivot - or die */
    if (fabsf(r3[1])>fabsf(r2[1])) SWAP_ROWS(r3, r2);
    if (fabsf(r2[1])>fabsf(r1[1])) SWAP_ROWS(r2, r1);
-   if (0.0 == r1[1])  return GL_FALSE;
+   if (0.0F == r1[1])  return GL_FALSE;
 
    /* eliminate second variable */
    m2 = r2[1]/r1[1]; m3 = r3[1]/r1[1];
    r2[2] -= m2 * r1[2]; r3[2] -= m3 * r1[2];
    r2[3] -= m2 * r1[3]; r3[3] -= m3 * r1[3];
-   s = r1[4]; if (0.0 != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; }
-   s = r1[5]; if (0.0 != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; }
-   s = r1[6]; if (0.0 != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; }
-   s = r1[7]; if (0.0 != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; }
+   s = r1[4]; if (0.0F != s) { r2[4] -= m2 * s; r3[4] -= m3 * s; }
+   s = r1[5]; if (0.0F != s) { r2[5] -= m2 * s; r3[5] -= m3 * s; }
+   s = r1[6]; if (0.0F != s) { r2[6] -= m2 * s; r3[6] -= m3 * s; }
+   s = r1[7]; if (0.0F != s) { r2[7] -= m2 * s; r3[7] -= m3 * s; }
 
    /* choose pivot - or die */
    if (fabsf(r3[2])>fabsf(r2[2])) SWAP_ROWS(r3, r2);
-   if (0.0 == r2[2])  return GL_FALSE;
+   if (0.0F == r2[2])  return GL_FALSE;
 
    /* eliminate third variable */
    m3 = r3[2]/r2[2];
@@ -421,7 +421,7 @@ static GLboolean invert_matrix_general( GLmatrix *mat )
    r3[7] -= m3 * r2[7];
 
    /* last check */
-   if (0.0 == r3[3]) return GL_FALSE;
+   if (0.0F == r3[3]) return GL_FALSE;
 
    s = 1.0F/r3[3];             /* now back substitute row 3 */
    r3[4] *= s; r3[5] *= s; r3[6] *= s; r3[7] *= s;
@@ -490,26 +490,26 @@ static GLboolean invert_matrix_3d_general( GLmatrix *mat )
     */
    pos = neg = 0.0;
    t =  MAT(in,0,0) * MAT(in,1,1) * MAT(in,2,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t =  MAT(in,1,0) * MAT(in,2,1) * MAT(in,0,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t =  MAT(in,2,0) * MAT(in,0,1) * MAT(in,1,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t = -MAT(in,2,0) * MAT(in,1,1) * MAT(in,0,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t = -MAT(in,1,0) * MAT(in,0,1) * MAT(in,2,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    t = -MAT(in,0,0) * MAT(in,2,1) * MAT(in,1,2);
-   if (t >= 0.0) pos += t; else neg += t;
+   if (t >= 0.0F) pos += t; else neg += t;
 
    det = pos + neg;
 
-   if (fabsf(det) < 1e-25)
+   if (fabsf(det) < 1e-25F)
       return GL_FALSE;
 
    det = 1.0F / det;
@@ -564,7 +564,7 @@ static GLboolean invert_matrix_3d( GLmatrix *mat )
                        MAT(in,0,1) * MAT(in,0,1) +
                        MAT(in,0,2) * MAT(in,0,2));
 
-      if (scale == 0.0)
+      if (scale == 0.0F)
          return GL_FALSE;
 
       scale = 1.0F / scale;
@@ -799,8 +799,8 @@ _math_matrix_rotate( GLmatrix *mat,
    GLfloat m[16];
    GLboolean optimized;
 
-   s = (GLfloat) sin( angle * M_PI / 180.0 );
-   c = (GLfloat) cos( angle * M_PI / 180.0 );
+   s = sinf( angle * M_PI / 180.0 );
+   c = cosf( angle * M_PI / 180.0 );
 
    memcpy(m, Identity, sizeof(GLfloat)*16);
    optimized = GL_FALSE;
@@ -859,7 +859,7 @@ _math_matrix_rotate( GLmatrix *mat,
    if (!optimized) {
       const GLfloat mag = sqrtf(x * x + y * y + z * z);
 
-      if (mag <= 1.0e-4) {
+      if (mag <= 1.0e-4F) {
          /* no rotation, leave mat as-is */
          return;
       }
@@ -1070,7 +1070,7 @@ _math_matrix_scale( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z )
    m[2] *= x;   m[6] *= y;   m[10] *= z;
    m[3] *= x;   m[7] *= y;   m[11] *= z;
 
-   if (fabsf(x - y) < 1e-8 && fabsf(x - z) < 1e-8)
+   if (fabsf(x - y) < 1e-8F && fabsf(x - z) < 1e-8F)
       mat->flags |= MAT_FLAG_UNIFORM_SCALE;
    else
       mat->flags |= MAT_FLAG_GENERAL_SCALE;
@@ -1111,8 +1111,8 @@ _math_matrix_translate( GLmatrix *mat, GLfloat x, GLfloat y, GLfloat z )
  * Transforms Normalized Device Coords to window/Z values.
  */
 void
-_math_matrix_viewport(GLmatrix *m, const double scale[3],
-                      const double translate[3], double depthMax)
+_math_matrix_viewport(GLmatrix *m, const float scale[3],
+                      const float translate[3], double depthMax)
 {
    m->m[MAT_SX] = scale[0];
    m->m[MAT_TX] = translate[0];
@@ -1206,7 +1206,7 @@ static void analyse_from_scratch( GLmatrix *mat )
    GLuint i;
 
    for (i = 0 ; i < 16 ; i++) {
-      if (m[i] == 0.0) mask |= (1<<i);
+      if (m[i] == 0.0F) mask |= (1<<i);
    }
 
    if (m[0] == 1.0F) mask |= (1<<16);
@@ -1240,12 +1240,12 @@ static void analyse_from_scratch( GLmatrix *mat )
       mat->type = MATRIX_2D;
 
       /* Check for scale */
-      if (SQ(mm-1) > SQ(1e-6) ||
-	  SQ(m4m4-1) > SQ(1e-6))
+      if (SQ(mm-1) > SQ(1e-6F) ||
+	  SQ(m4m4-1) > SQ(1e-6F))
 	 mat->flags |= MAT_FLAG_GENERAL_SCALE;
 
       /* Check for rotation */
-      if (SQ(mm4) > SQ(1e-6))
+      if (SQ(mm4) > SQ(1e-6F))
 	 mat->flags |= MAT_FLAG_GENERAL_3D;
       else
 	 mat->flags |= MAT_FLAG_ROTATION;
@@ -1255,9 +1255,9 @@ static void analyse_from_scratch( GLmatrix *mat )
       mat->type = MATRIX_3D_NO_ROT;
 
       /* Check for scale */
-      if (SQ(m[0]-m[5]) < SQ(1e-6) &&
-	  SQ(m[0]-m[10]) < SQ(1e-6)) {
-	 if (SQ(m[0]-1.0) > SQ(1e-6)) {
+      if (SQ(m[0]-m[5]) < SQ(1e-6F) &&
+	  SQ(m[0]-m[10]) < SQ(1e-6F)) {
+	 if (SQ(m[0]-1.0F) > SQ(1e-6F)) {
 	    mat->flags |= MAT_FLAG_UNIFORM_SCALE;
          }
       }
@@ -1275,8 +1275,8 @@ static void analyse_from_scratch( GLmatrix *mat )
       mat->type = MATRIX_3D;
 
       /* Check for scale */
-      if (SQ(c1-c2) < SQ(1e-6) && SQ(c1-c3) < SQ(1e-6)) {
-	 if (SQ(c1-1.0) > SQ(1e-6))
+      if (SQ(c1-c2) < SQ(1e-6F) && SQ(c1-c3) < SQ(1e-6F)) {
+	 if (SQ(c1-1.0F) > SQ(1e-6F))
 	    mat->flags |= MAT_FLAG_UNIFORM_SCALE;
 	 /* else no scale at all */
       }
@@ -1285,10 +1285,10 @@ static void analyse_from_scratch( GLmatrix *mat )
       }
 
       /* Check for rotation */
-      if (SQ(d1) < SQ(1e-6)) {
+      if (SQ(d1) < SQ(1e-6F)) {
 	 CROSS3( cp, m, m+4 );
 	 SUB_3V( cp, cp, (m+8) );
-	 if (LEN_SQUARED_3FV(cp) < SQ(1e-6))
+	 if (LEN_SQUARED_3FV(cp) < SQ(1e-6F))
 	    mat->flags |= MAT_FLAG_ROTATION;
 	 else
 	    mat->flags |= MAT_FLAG_GENERAL_3D;
diff --git a/src/mesa/math/m_matrix.h b/src/mesa/math/m_matrix.h
index 778d716dce7..c34d9e3022f 100644
--- a/src/mesa/math/m_matrix.h
+++ b/src/mesa/math/m_matrix.h
@@ -122,8 +122,8 @@ _math_matrix_frustum( GLmatrix *mat,
 		      GLfloat nearval, GLfloat farval );
 
 extern void
-_math_matrix_viewport( GLmatrix *m, const double scale[3],
-                       const double translate[3], double depthMax );
+_math_matrix_viewport( GLmatrix *m, const float scale[3],
+                       const float translate[3], double depthMax );
 
 extern void
 _math_matrix_set_identity( GLmatrix *dest );
diff --git a/src/mesa/math/m_norm_tmp.h b/src/mesa/math/m_norm_tmp.h
index d3ec1c22ecd..6f1db8d0bd0 100644
--- a/src/mesa/math/m_norm_tmp.h
+++ b/src/mesa/math/m_norm_tmp.h
@@ -80,7 +80,7 @@ TAG(transform_normalize_normals)( const GLmatrix *mat,
       }
    }
    else {
-      if (scale != 1.0) {
+      if (scale != 1.0f) {
 	 m0 *= scale,  m4 *= scale,  m8 *= scale;
 	 m1 *= scale,  m5 *= scale,  m9 *= scale;
 	 m2 *= scale,  m6 *= scale,  m10 *= scale;
diff --git a/src/mesa/math/m_vector.h b/src/mesa/math/m_vector.h
index 8551ee7520e..5bd76b8987d 100644
--- a/src/mesa/math/m_vector.h
+++ b/src/mesa/math/m_vector.h
@@ -51,7 +51,7 @@
 
 /**
  * Wrap all the information about vectors up in a struct.  Has
- * additional fields compared to the other vectors to help us track of
+ * additional fields compared to the other vectors to help us track
  * different vertex sizes, and whether we need to clean columns out
  * because they contain non-(0,0,0,1) values.
  *
@@ -61,7 +61,7 @@
  */
 typedef struct {
    GLfloat (*data)[4];	/**< may be malloc'd or point to client data */
-   GLfloat *start;	/**< points somewhere inside of <data> */
+   GLfloat *start;	/**< points somewhere inside of GLvector4f::data */
    GLuint count;	/**< size of the vector (in elements) */
    GLuint stride;	/**< stride from one element to the next (in bytes) */
    GLuint size;		/**< 2-4 for vertices and 1-4 for texcoords */
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 3bffe90ff1f..b8b082e2a59 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -534,6 +534,7 @@ type_size(const struct glsl_type *type)
       return size;
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_SUBROUTINE:
       /* Samplers take up one slot in UNIFORMS[], but they're baked in
        * at link time.
        */
@@ -1343,6 +1344,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
    case ir_unop_dFdx_fine:
    case ir_unop_dFdy_coarse:
    case ir_unop_dFdy_fine:
+   case ir_unop_subroutine_to_int:
       assert(!"not supported");
       break;
 
@@ -2385,7 +2387,7 @@ _mesa_generate_parameters_list_for_uniforms(struct gl_shader_program
       ir_variable *var = node->as_variable();
 
       if ((var == NULL) || (var->data.mode != ir_var_uniform)
-	  || var->is_in_uniform_block() || (strncmp(var->name, "gl_", 3) == 0))
+	  || var->is_in_buffer_block() || (strncmp(var->name, "gl_", 3) == 0))
 	 continue;
 
       add.process(var);
@@ -2452,6 +2454,7 @@ _mesa_associate_uniform_storage(struct gl_context *ctx,
 	    break;
 	 case GLSL_TYPE_SAMPLER:
 	 case GLSL_TYPE_IMAGE:
+         case GLSL_TYPE_SUBROUTINE:
 	    format = uniform_native;
 	    columns = 1;
 	    break;
@@ -2912,7 +2915,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 	 if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput
 	     || options->EmitNoIndirectTemp || options->EmitNoIndirectUniform)
 	   progress =
-	     lower_variable_index_to_cond_assign(ir,
+	     lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir,
 						 options->EmitNoIndirectInput,
 						 options->EmitNoIndirectOutput,
 						 options->EmitNoIndirectTemp,
@@ -2977,6 +2980,8 @@ _mesa_glsl_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
    if (prog->LinkStatus) {
       if (!ctx->Driver.LinkShader(ctx, prog)) {
 	 prog->LinkStatus = GL_FALSE;
+      } else {
+         build_program_resource_list(ctx, prog);
       }
    }
 
diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c
index 46260b54882..2c52d0db508 100644
--- a/src/mesa/program/prog_execute.c
+++ b/src/mesa/program/prog_execute.c
@@ -623,7 +623,7 @@ _mesa_execute_program(struct gl_context * ctx,
             GLfloat a[4], result[4];
             fetch_vector1(&inst->SrcReg[0], machine, a);
             result[0] = result[1] = result[2] = result[3]
-               = (GLfloat) cos(a[0]);
+               = cosf(a[0]);
             store_vector4(inst, machine, result);
          }
          break;
@@ -723,7 +723,7 @@ _mesa_execute_program(struct gl_context * ctx,
                 * result.z = result.x * APPX(result.y)
                 * We do what the ARB extension says.
                 */
-               q[2] = (GLfloat) pow(2.0, t[0]);
+               q[2] = exp2f(t[0]);
             }
             q[1] = t[0] - floor_t0;
             q[3] = 1.0F;
@@ -734,7 +734,7 @@ _mesa_execute_program(struct gl_context * ctx,
          {
             GLfloat a[4], result[4], val;
             fetch_vector1(&inst->SrcReg[0], machine, a);
-            val = (GLfloat) pow(2.0, a[0]);
+            val = exp2f(a[0]);
             /*
             if (IS_INF_OR_NAN(val))
                val = 1.0e10;
@@ -776,7 +776,7 @@ _mesa_execute_program(struct gl_context * ctx,
             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
                GLfloat a[4];
                fetch_vector1(&inst->SrcReg[0], machine, a);
-               cond = (a[0] != 0.0);
+               cond = (a[0] != 0.0F);
             }
             else {
                cond = eval_condition(machine, inst);
@@ -834,7 +834,7 @@ _mesa_execute_program(struct gl_context * ctx,
                val = -FLT_MAX;
             }
             else {
-               val = (float)(log(a[0]) * 1.442695F);
+               val = logf(a[0]) * 1.442695F;
             }
             result[0] = result[1] = result[2] = result[3] = val;
             store_vector4(inst, machine, result);
@@ -853,10 +853,10 @@ _mesa_execute_program(struct gl_context * ctx,
             result[1] = a[0];
             /* XXX we could probably just use pow() here */
             if (a[0] > 0.0F) {
-               if (a[1] == 0.0 && a[3] == 0.0)
+               if (a[1] == 0.0F && a[3] == 0.0F)
                   result[2] = 1.0F;
                else
-                  result[2] = (GLfloat) pow(a[1], a[3]);
+                  result[2] = powf(a[1], a[3]);
             }
             else {
                result[2] = 0.0F;
@@ -886,12 +886,12 @@ _mesa_execute_program(struct gl_context * ctx,
                   int exponent;
                   GLfloat mantissa = frexpf(t[0], &exponent);
                   q[0] = (GLfloat) (exponent - 1);
-                  q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
+                  q[1] = 2.0F * mantissa; /* map [.5, 1) -> [1, 2) */
 
 		  /* The fast LOG2 macro doesn't meet the precision
 		   * requirements.
 		   */
-                  q[2] = (float)(log(t[0]) * 1.442695F);
+                  q[2] = logf(t[0]) * 1.442695F;
                }
             }
             else {
@@ -1051,7 +1051,7 @@ _mesa_execute_program(struct gl_context * ctx,
             fetch_vector1(&inst->SrcReg[0], machine, a);
             fetch_vector1(&inst->SrcReg[1], machine, b);
             result[0] = result[1] = result[2] = result[3]
-               = (GLfloat) pow(a[0], b[0]);
+               = powf(a[0], b[0]);
             store_vector4(inst, machine, result);
          }
          break;
@@ -1095,10 +1095,10 @@ _mesa_execute_program(struct gl_context * ctx,
          {
             GLfloat a[4], result[4];
             fetch_vector1(&inst->SrcReg[0], machine, a);
-            result[0] = (GLfloat) cos(a[0]);
-            result[1] = (GLfloat) sin(a[0]);
-            result[2] = 0.0;    /* undefined! */
-            result[3] = 0.0;    /* undefined! */
+            result[0] = cosf(a[0]);
+            result[1] = sinf(a[0]);
+            result[2] = 0.0F;    /* undefined! */
+            result[3] = 0.0F;    /* undefined! */
             store_vector4(inst, machine, result);
          }
          break;
@@ -1161,7 +1161,7 @@ _mesa_execute_program(struct gl_context * ctx,
             GLfloat a[4], result[4];
             fetch_vector1(&inst->SrcReg[0], machine, a);
             result[0] = result[1] = result[2] = result[3]
-               = (GLfloat) sin(a[0]);
+               = sinf(a[0]);
             store_vector4(inst, machine, result);
          }
          break;
@@ -1360,7 +1360,7 @@ _mesa_execute_program(struct gl_context * ctx,
              * zero, we'd probably be fine except for an assert in
              * IROUND_POS() which gets triggered by the inf values created.
              */
-            if (texcoord[3] != 0.0) {
+            if (texcoord[3] != 0.0F) {
                texcoord[0] /= texcoord[3];
                texcoord[1] /= texcoord[3];
                texcoord[2] /= texcoord[3];
@@ -1380,7 +1380,7 @@ _mesa_execute_program(struct gl_context * ctx,
 
             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
-                texcoord[3] != 0.0) {
+                texcoord[3] != 0.0F) {
                texcoord[0] /= texcoord[3];
                texcoord[1] /= texcoord[3];
                texcoord[2] /= texcoord[3];
diff --git a/src/mesa/program/prog_opt_constant_fold.c b/src/mesa/program/prog_opt_constant_fold.c
index 3811c0d8aa6..e2518e660e6 100644
--- a/src/mesa/program/prog_opt_constant_fold.c
+++ b/src/mesa/program/prog_opt_constant_fold.c
@@ -38,6 +38,8 @@ src_regs_are_constant(const struct prog_instruction *inst, unsigned num_srcs)
    for (i = 0; i < num_srcs; i++) {
       if (inst->SrcReg[i].File != PROGRAM_CONSTANT)
 	 return false;
+      if (inst->SrcReg[i].RelAddr)
+         return false;
    }
 
    return true;
diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c
index e4faa63c06f..bb7c2c6e527 100644
--- a/src/mesa/program/prog_print.c
+++ b/src/mesa/program/prog_print.c
@@ -147,6 +147,8 @@ arb_input_attrib_string(GLuint index, GLenum progType)
       "fragment.(twenty-one)", /* VARYING_SLOT_VIEWPORT */
       "fragment.(twenty-two)", /* VARYING_SLOT_FACE */
       "fragment.(twenty-three)", /* VARYING_SLOT_PNTC */
+      "fragment.(twenty-four)", /* VARYING_SLOT_TESS_LEVEL_OUTER */
+      "fragment.(twenty-five)", /* VARYING_SLOT_TESS_LEVEL_INNER */
       "fragment.varying[0]",
       "fragment.varying[1]",
       "fragment.varying[2]",
@@ -272,6 +274,8 @@ arb_output_attrib_string(GLuint index, GLenum progType)
       "result.(twenty-one)", /* VARYING_SLOT_VIEWPORT */
       "result.(twenty-two)", /* VARYING_SLOT_FACE */
       "result.(twenty-three)", /* VARYING_SLOT_PNTC */
+      "result.(twenty-four)", /* VARYING_SLOT_TESS_LEVEL_OUTER */
+      "result.(twenty-five)", /* VARYING_SLOT_TESS_LEVEL_INNER */
       "result.varying[0]",
       "result.varying[1]",
       "result.varying[2]",
@@ -1015,6 +1019,12 @@ _mesa_write_shader_to_file(const struct gl_shader *shader)
    case MESA_SHADER_FRAGMENT:
       type = "frag";
       break;
+   case MESA_SHADER_TESS_CTRL:
+      type = "tesc";
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      type = "tese";
+      break;
    case MESA_SHADER_VERTEX:
       type = "vert";
       break;
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index c13e61b1630..2d03bba3d12 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -285,6 +285,38 @@ _mesa_init_compute_program(struct gl_context *ctx,
 }
 
 
+/**
+ * Initialize a new tessellation control program object.
+ * Returns the embedded base program, or NULL when no object was passed in.
+ */
+struct gl_program *
+_mesa_init_tess_ctrl_program(struct gl_context *ctx,
+                             struct gl_tess_ctrl_program *prog,
+                             GLenum target, GLuint id)
+{
+   if (!prog)
+      return NULL;
+   init_program_struct(&prog->Base, target, id);
+   return &prog->Base;
+}
+
+
+/**
+ * Initialize a new tessellation evaluation program object.
+ * Returns the embedded base program, or NULL when no object was passed in.
+ */
+struct gl_program *
+_mesa_init_tess_eval_program(struct gl_context *ctx,
+                             struct gl_tess_eval_program *prog,
+                             GLenum target, GLuint id)
+{
+   if (!prog)
+      return NULL;
+   init_program_struct(&prog->Base, target, id);
+   return &prog->Base;
+}
+
+
 /**
  * Initialize a new geometry program object.
  */
@@ -333,6 +365,16 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id)
                                          CALLOC_STRUCT(gl_geometry_program),
                                          target, id);
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      prog = _mesa_init_tess_ctrl_program(ctx,
+                                          CALLOC_STRUCT(gl_tess_ctrl_program),
+                                          target, id);
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      prog = _mesa_init_tess_eval_program(ctx,
+                                         CALLOC_STRUCT(gl_tess_eval_program),
+                                         target, id);
+      break;
    case GL_COMPUTE_PROGRAM_NV:
       prog = _mesa_init_compute_program(ctx,
                                         CALLOC_STRUCT(gl_compute_program),
@@ -554,6 +596,23 @@ _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog)
          gpc->UsesStreams = gp->UsesStreams;
       }
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      {
+         const struct gl_tess_ctrl_program *tcp = gl_tess_ctrl_program_const(prog);
+         struct gl_tess_ctrl_program *tcpc = gl_tess_ctrl_program(clone);
+         tcpc->VerticesOut = tcp->VerticesOut;
+      }
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      {
+         const struct gl_tess_eval_program *tep = gl_tess_eval_program_const(prog);
+         struct gl_tess_eval_program *tepc = gl_tess_eval_program(clone);
+         tepc->PrimitiveMode = tep->PrimitiveMode;
+         tepc->Spacing = tep->Spacing;
+         tepc->VertexOrder = tep->VertexOrder;
+         tepc->PointMode = tep->PointMode;
+      }
+      break;
    default:
       _mesa_problem(NULL, "Unexpected target in _mesa_clone_program");
    }
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 2d92ab2f118..a894147cafd 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -78,6 +78,16 @@ _mesa_init_fragment_program(struct gl_context *ctx,
                             struct gl_fragment_program *prog,
                             GLenum target, GLuint id);
 
+extern struct gl_program *
+_mesa_init_tess_ctrl_program(struct gl_context *ctx,
+                            struct gl_tess_ctrl_program *prog,
+                            GLenum target, GLuint id);
+
+extern struct gl_program *
+_mesa_init_tess_eval_program(struct gl_context *ctx,
+                            struct gl_tess_eval_program *prog,
+                            GLenum target, GLuint id);
+
 extern struct gl_program *
 _mesa_init_geometry_program(struct gl_context *ctx,
                             struct gl_geometry_program *prog,
@@ -147,6 +157,25 @@ _mesa_reference_compprog(struct gl_context *ctx,
                            (struct gl_program *) prog);
 }
 
+
+static inline void
+_mesa_reference_tesscprog(struct gl_context *ctx,
+                          struct gl_tess_ctrl_program **ptr,
+                          struct gl_tess_ctrl_program *prog)
+{
+   struct gl_program **base_ptr = (struct gl_program **) ptr;
+   _mesa_reference_program(ctx, base_ptr, (struct gl_program *) prog);
+}
+
+static inline void
+_mesa_reference_tesseprog(struct gl_context *ctx,
+                          struct gl_tess_eval_program **ptr,
+                          struct gl_tess_eval_program *prog)
+{
+   struct gl_program **base_ptr = (struct gl_program **) ptr;
+   _mesa_reference_program(ctx, base_ptr, (struct gl_program *) prog);
+}
+
 extern struct gl_program *
 _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog);
 
@@ -157,6 +186,20 @@ _mesa_clone_vertex_program(struct gl_context *ctx,
    return (struct gl_vertex_program *) _mesa_clone_program(ctx, &prog->Base);
 }
 
+static inline struct gl_tess_ctrl_program *
+_mesa_clone_tess_ctrl_program(struct gl_context *ctx,
+                              const struct gl_tess_ctrl_program *prog)
+{
+   return (struct gl_tess_ctrl_program *) _mesa_clone_program(ctx, &prog->Base);
+}
+
+static inline struct gl_tess_eval_program *
+_mesa_clone_tess_eval_program(struct gl_context *ctx,
+                              const struct gl_tess_eval_program *prog)
+{
+   return (struct gl_tess_eval_program *) _mesa_clone_program(ctx, &prog->Base);
+}
+
 static inline struct gl_geometry_program *
 _mesa_clone_geometry_program(struct gl_context *ctx,
                              const struct gl_geometry_program *prog)
@@ -216,6 +259,10 @@ _mesa_program_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_PROGRAM_NV:
       return MESA_SHADER_GEOMETRY;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      return MESA_SHADER_TESS_CTRL;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      return MESA_SHADER_TESS_EVAL;
    case GL_COMPUTE_PROGRAM_NV:
       return MESA_SHADER_COMPUTE;
    default:
@@ -235,6 +282,10 @@ _mesa_shader_stage_to_program(unsigned stage)
       return GL_FRAGMENT_PROGRAM_ARB;
    case MESA_SHADER_GEOMETRY:
       return GL_GEOMETRY_PROGRAM_NV;
+   case MESA_SHADER_TESS_CTRL:
+      return GL_TESS_CONTROL_PROGRAM_NV;
+   case MESA_SHADER_TESS_EVAL:
+      return GL_TESS_EVALUATION_PROGRAM_NV;
    case MESA_SHADER_COMPUTE:
       return GL_COMPUTE_PROGRAM_NV;
    }
@@ -244,7 +295,9 @@ _mesa_shader_stage_to_program(unsigned stage)
 }
 
 
-/* Cast wrappers from gl_program to gl_vertex/geometry/fragment_program */
+/* Cast wrappers from gl_program to derived program types.
+ * (e.g. gl_vertex_program)
+ */
 
 static inline struct gl_fragment_program *
 gl_fragment_program(struct gl_program *prog)
@@ -297,6 +350,31 @@ gl_compute_program_const(const struct gl_program *prog)
    return (const struct gl_compute_program *) prog;
 }
 
+static inline struct gl_tess_ctrl_program *
+gl_tess_ctrl_program(struct gl_program *prog)
+{
+   return (struct gl_tess_ctrl_program *) prog;
+}
+
+static inline const struct gl_tess_ctrl_program *
+gl_tess_ctrl_program_const(const struct gl_program *prog)
+{
+   return (const struct gl_tess_ctrl_program *) prog;
+}
+
+/* The same mutable/const cast pair, for tessellation evaluation programs. */
+static inline struct gl_tess_eval_program *
+gl_tess_eval_program(struct gl_program *prog)
+{
+   return (struct gl_tess_eval_program *) prog;
+}
+
+static inline const struct gl_tess_eval_program *
+gl_tess_eval_program_const(const struct gl_program *prog)
+{
+   return (const struct gl_tess_eval_program *) prog;
+}
+
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/src/mesa/program/program_parse_extra.c b/src/mesa/program/program_parse_extra.c
index 32b54afc57b..71f86d13ace 100644
--- a/src/mesa/program/program_parse_extra.c
+++ b/src/mesa/program/program_parse_extra.c
@@ -163,6 +163,8 @@ _mesa_ARBvp_parse_option(struct asm_parser_state *state, const char *option)
 int
 _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option)
 {
+   unsigned fog_option;
+
    /* All of the options currently supported start with "ARB_".  The code is
     * currently structured with nested if-statements because eventually options
     * that start with "NV_" will be supported.  This structure will result in
@@ -177,20 +179,42 @@ _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option)
       if (strncmp(option, "fog_", 4) == 0) {
 	 option += 4;
 
-	 if (state->option.Fog == OPTION_NONE) {
-	    if (strcmp(option, "exp") == 0) {
-	       state->option.Fog = OPTION_FOG_EXP;
-	       return 1;
-	    } else if (strcmp(option, "exp2") == 0) {
-	       state->option.Fog = OPTION_FOG_EXP2;
-	       return 1;
-	    } else if (strcmp(option, "linear") == 0) {
-	       state->option.Fog = OPTION_FOG_LINEAR;
-	       return 1;
-	    }
-	 }
+         if (strcmp(option, "exp") == 0) {
+            fog_option = OPTION_FOG_EXP;
+         } else if (strcmp(option, "exp2") == 0) {
+            fog_option = OPTION_FOG_EXP2;
+         } else if (strcmp(option, "linear") == 0) {
+            fog_option = OPTION_FOG_LINEAR;
+         } else {
+            /* invalid option */
+            return 0;
+         }
 
-	 return 0;
+         if (state->option.Fog == OPTION_NONE) {
+            state->option.Fog = fog_option;
+            return 1;
+         }
+
+         /* The ARB_fragment_program specification instructs us to handle
+          * redundant options in two seemingly contradictory ways:
+          *
+          * Section 3.11.4.5.1 says:
+          * "Only one fog application option may be specified by any given
+          *  fragment program.  A fragment program that specifies more than one
+          *  of the program options "ARB_fog_exp", "ARB_fog_exp2", and
+          *  "ARB_fog_linear", will fail to load."
+          *
+          * Issue 27 says:
+          * "The three mandatory options are ARB_fog_exp, ARB_fog_exp2, and
+          *  ARB_fog_linear.  As these options are mutually exclusive by
+          *  nature, specifying more than one is not useful.  If more than one
+          *  is specified, the last one encountered in the <optionSequence>
+          *  will be the one to actually modify the execution environment."
+          *
+          * We choose to allow programs to specify the same OPTION redundantly,
+          * but fail to load programs that specify contradictory options.
+          */
+         return state->option.Fog == fog_option ? 1 : 0;
       } else if (strncmp(option, "precision_hint_", 15) == 0) {
 	 option += 15;
 
diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 428f2d9d7d7..43dbadd4a7e 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -46,9 +46,10 @@ static const struct st_tracked_state *atoms[] =
    &st_update_depth_stencil_alpha,
    &st_update_clip,
 
-   &st_finalize_textures,
    &st_update_fp,
    &st_update_gp,
+   &st_update_tep,
+   &st_update_tcp,
    &st_update_vp,
 
    &st_update_rasterizer,
@@ -59,17 +60,24 @@ static const struct st_tracked_state *atoms[] =
    &st_update_vertex_texture,
    &st_update_fragment_texture,
    &st_update_geometry_texture,
+   &st_update_tessctrl_texture,
+   &st_update_tesseval_texture,
    &st_update_sampler, /* depends on update_*_texture for swizzle */
    &st_update_framebuffer,
    &st_update_msaa,
    &st_update_sample_shading,
    &st_update_vs_constants,
+   &st_update_tcs_constants,
+   &st_update_tes_constants,
    &st_update_gs_constants,
    &st_update_fs_constants,
    &st_bind_vs_ubos,
+   &st_bind_tcs_ubos,
+   &st_bind_tes_ubos,
    &st_bind_fs_ubos,
    &st_bind_gs_ubos,
    &st_update_pixel_transfer,
+   &st_update_tess,
 
    /* this must be done after the vertex program update */
    &st_update_array
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index c50111d501f..a24842baa4f 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -52,6 +52,8 @@ extern const struct st_tracked_state st_update_clip;
 extern const struct st_tracked_state st_update_depth_stencil_alpha;
 extern const struct st_tracked_state st_update_fp;
 extern const struct st_tracked_state st_update_gp;
+extern const struct st_tracked_state st_update_tep;
+extern const struct st_tracked_state st_update_tcp;
 extern const struct st_tracked_state st_update_vp;
 extern const struct st_tracked_state st_update_rasterizer;
 extern const struct st_tracked_state st_update_polygon_stipple;
@@ -64,14 +66,20 @@ extern const struct st_tracked_state st_update_sampler;
 extern const struct st_tracked_state st_update_fragment_texture;
 extern const struct st_tracked_state st_update_vertex_texture;
 extern const struct st_tracked_state st_update_geometry_texture;
-extern const struct st_tracked_state st_finalize_textures;
+extern const struct st_tracked_state st_update_tessctrl_texture;
+extern const struct st_tracked_state st_update_tesseval_texture;
 extern const struct st_tracked_state st_update_fs_constants;
 extern const struct st_tracked_state st_update_gs_constants;
+extern const struct st_tracked_state st_update_tes_constants;
+extern const struct st_tracked_state st_update_tcs_constants;
 extern const struct st_tracked_state st_update_vs_constants;
 extern const struct st_tracked_state st_bind_fs_ubos;
 extern const struct st_tracked_state st_bind_vs_ubos;
 extern const struct st_tracked_state st_bind_gs_ubos;
+extern const struct st_tracked_state st_bind_tcs_ubos;
+extern const struct st_tracked_state st_bind_tes_ubos;
 extern const struct st_tracked_state st_update_pixel_transfer;
+extern const struct st_tracked_state st_update_tess;
 
 
 GLuint st_compare_func_to_pipe(GLenum func);
diff --git a/src/mesa/state_tracker/st_atom_clip.c b/src/mesa/state_tracker/st_atom_clip.c
index f82c1332afc..506a770499f 100644
--- a/src/mesa/state_tracker/st_atom_clip.c
+++ b/src/mesa/state_tracker/st_atom_clip.c
@@ -59,8 +59,11 @@ static void update_clip( struct st_context *st )
    memcpy(clip.ucp,
           use_eye ? ctx->Transform.EyeUserPlane
                   : ctx->Transform._ClipUserPlane, sizeof(clip.ucp));
-   st->state.clip = clip;
-   cso_set_clip(st->cso_context, &clip);
+
+   if (memcmp(&st->state.clip, &clip, sizeof(clip)) != 0) {
+      st->state.clip = clip;
+      st->pipe->set_clip_state(st->pipe, &clip);
+   }
 }
 
 
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index a54e0d9dbf5..6affb4d84d5 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -59,7 +59,9 @@ void st_upload_constants( struct st_context *st,
 {
    assert(shader_type == PIPE_SHADER_VERTEX ||
           shader_type == PIPE_SHADER_FRAGMENT ||
-          shader_type == PIPE_SHADER_GEOMETRY);
+          shader_type == PIPE_SHADER_GEOMETRY ||
+          shader_type == PIPE_SHADER_TESS_CTRL ||
+          shader_type == PIPE_SHADER_TESS_EVAL);
 
    /* update constants */
    if (params && params->NumParameters) {
@@ -178,6 +180,50 @@ const struct st_tracked_state st_update_gs_constants = {
    update_gs_constants					/* update */
 };
 
+/* Tessellation control shader: pass the parameter list of st->tcp to
+ * st_upload_constants(); nothing is uploaded when st->tcp is NULL. */
+static void update_tcs_constants(struct st_context *st )
+{
+   struct st_tessctrl_program *prog = st->tcp;
+
+   if (!prog)
+      return;
+
+   st_upload_constants(st, prog->Base.Base.Parameters,
+                       PIPE_SHADER_TESS_CTRL);
+}
+
+const struct st_tracked_state st_update_tcs_constants = {
+   "st_update_tcs_constants",				/* name */
+   {							/* dirty */
+      _NEW_PROGRAM_CONSTANTS,                           /* mesa */
+      ST_NEW_TESSCTRL_PROGRAM,				/* st */
+   },
+   update_tcs_constants					/* update */
+};
+
+/* Tessellation evaluation shader: pass the parameter list of st->tep to
+ * st_upload_constants(); nothing is uploaded when st->tep is NULL. */
+static void update_tes_constants(struct st_context *st )
+{
+   struct st_tesseval_program *prog = st->tep;
+
+   if (!prog)
+      return;
+
+   st_upload_constants(st, prog->Base.Base.Parameters,
+                       PIPE_SHADER_TESS_EVAL);
+}
+
+const struct st_tracked_state st_update_tes_constants = {
+   "st_update_tes_constants",				/* name */
+   {							/* dirty */
+      _NEW_PROGRAM_CONSTANTS,                           /* mesa */
+      ST_NEW_TESSEVAL_PROGRAM,				/* st */
+   },
+   update_tes_constants					/* update */
+};
+
 static void st_bind_ubos(struct st_context *st,
                            struct gl_shader *shader,
                            unsigned shader_type)
@@ -275,3 +321,43 @@ const struct st_tracked_state st_bind_gs_ubos = {
    },
    bind_gs_ubos
 };
+
+/* Bind UBOs for the linked TCS of the current program, if one is set. */
+static void bind_tcs_ubos(struct st_context *st)
+{
+   struct gl_shader_program *linked =
+      st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
+
+   if (linked)
+      st_bind_ubos(st, linked->_LinkedShaders[MESA_SHADER_TESS_CTRL],
+                   PIPE_SHADER_TESS_CTRL);
+}
+
+const struct st_tracked_state st_bind_tcs_ubos = {
+   "st_bind_tcs_ubos",
+   {
+      0,
+      ST_NEW_TESSCTRL_PROGRAM | ST_NEW_UNIFORM_BUFFER,
+   },
+   bind_tcs_ubos
+};
+
+/* Bind UBOs for the linked TES of the current program, if one is set. */
+static void bind_tes_ubos(struct st_context *st)
+{
+   struct gl_shader_program *linked =
+      st->ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+
+   if (linked)
+      st_bind_ubos(st, linked->_LinkedShaders[MESA_SHADER_TESS_EVAL],
+                   PIPE_SHADER_TESS_EVAL);
+}
+
+const struct st_tracked_state st_bind_tes_ubos = {
+   "st_bind_tes_ubos",
+   {
+      0,
+      ST_NEW_TESSEVAL_PROGRAM | ST_NEW_UNIFORM_BUFFER,
+   },
+   bind_tes_ubos
+};
diff --git a/src/mesa/state_tracker/st_atom_depth.c b/src/mesa/state_tracker/st_atom_depth.c
index c4bca8d09b5..d9cc97029fb 100644
--- a/src/mesa/state_tracker/st_atom_depth.c
+++ b/src/mesa/state_tracker/st_atom_depth.c
@@ -105,10 +105,17 @@ update_depth_stencil_alpha(struct st_context *st)
    memset(dsa, 0, sizeof(*dsa));
    memset(&sr, 0, sizeof(sr));
 
-   if (ctx->Depth.Test && ctx->DrawBuffer->Visual.depthBits > 0) {
-      dsa->depth.enabled = 1;
-      dsa->depth.writemask = ctx->Depth.Mask;
-      dsa->depth.func = st_compare_func_to_pipe(ctx->Depth.Func);
+   if (ctx->DrawBuffer->Visual.depthBits > 0) {
+      if (ctx->Depth.Test) {
+         dsa->depth.enabled = 1;
+         dsa->depth.writemask = ctx->Depth.Mask;
+         dsa->depth.func = st_compare_func_to_pipe(ctx->Depth.Func);
+      }
+      if (ctx->Depth.BoundsTest) {
+         dsa->depth.bounds_test = 1;
+         dsa->depth.bounds_min = ctx->Depth.BoundsMin;
+         dsa->depth.bounds_max = ctx->Depth.BoundsMax;
+      }
    }
 
    if (ctx->Stencil.Enabled && ctx->DrawBuffer->Visual.stencilBits > 0) {
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index b68eb16d7be..4252c27962e 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -245,6 +245,7 @@ update_shader_samplers(struct st_context *st,
    GLuint unit;
    GLbitfield samplers_used;
    const GLuint old_max = *num_samplers;
+   const struct pipe_sampler_state *states[PIPE_MAX_SAMPLERS];
 
    samplers_used = prog->SamplersUsed;
 
@@ -261,13 +262,11 @@ update_shader_samplers(struct st_context *st,
          const GLuint texUnit = prog->SamplerUnits[unit];
 
          convert_sampler(st, sampler, texUnit);
-
+         states[unit] = sampler;
          *num_samplers = unit + 1;
-
-         cso_single_sampler(st->cso_context, shader_stage, unit, sampler);
       }
       else if (samplers_used != 0 || unit < old_max) {
-         cso_single_sampler(st->cso_context, shader_stage, unit, NULL);
+         states[unit] = NULL;
       }
       else {
          /* if we've reset all the old samplers and we have no more new ones */
@@ -275,7 +274,7 @@ update_shader_samplers(struct st_context *st,
       }
    }
 
-   cso_single_sampler_done(st->cso_context, shader_stage);
+   cso_set_samplers(st->cso_context, shader_stage, *num_samplers, states);
 }
 
 
@@ -306,6 +305,22 @@ update_samplers(struct st_context *st)
                              st->state.samplers[PIPE_SHADER_GEOMETRY],
                              &st->state.num_samplers[PIPE_SHADER_GEOMETRY]);
    }
+   if (ctx->TessCtrlProgram._Current) {
+      update_shader_samplers(st,
+                             PIPE_SHADER_TESS_CTRL,
+                             &ctx->TessCtrlProgram._Current->Base,
+                             ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits,
+                             st->state.samplers[PIPE_SHADER_TESS_CTRL],
+                             &st->state.num_samplers[PIPE_SHADER_TESS_CTRL]);
+   }
+   if (ctx->TessEvalProgram._Current) {
+      update_shader_samplers(st,
+                             PIPE_SHADER_TESS_EVAL,
+                             &ctx->TessEvalProgram._Current->Base,
+                             ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits,
+                             st->state.samplers[PIPE_SHADER_TESS_EVAL],
+                             &st->state.num_samplers[PIPE_SHADER_TESS_EVAL]);
+   }
 }
 
 
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index ad8d2624fc9..fee15a980f3 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -49,24 +49,6 @@
 #include "st_program.h"
 
 
-/**
- * Return pointer to a pass-through fragment shader.
- * This shader is used when a texture is missing/incomplete.
- */
-static void *
-get_passthrough_fs(struct st_context *st)
-{
-   if (!st->passthrough_fs) {
-      st->passthrough_fs =
-         util_make_fragment_passthrough_shader(st->pipe, TGSI_SEMANTIC_COLOR,
-                                               TGSI_INTERPOLATE_PERSPECTIVE,
-                                               TRUE);
-   }
-
-   return st->passthrough_fs;
-}
-
-
 /**
  * Update fragment program state/atom.  This involves translating the
  * Mesa fragment program into a gallium fragment program and binding it.
@@ -96,15 +78,8 @@ update_fp( struct st_context *st )
 
    st_reference_fragprog(st, &st->fp, stfp);
 
-   if (st->missing_textures) {
-      /* use a pass-through frag shader that uses no textures */
-      void *fs = get_passthrough_fs(st);
-      cso_set_fragment_shader_handle(st->cso_context, fs);
-   }
-   else {
-      cso_set_fragment_shader_handle(st->cso_context,
-                                     st->fp_variant->driver_shader);
-   }
+   cso_set_fragment_shader_handle(st->cso_context,
+                                  st->fp_variant->driver_shader);
 }
 
 
@@ -210,3 +185,75 @@ const struct st_tracked_state st_update_gp = {
    },
    update_gp  				/* update */
 };
+
+
+
+/* Select the current TCS variant and bind its driver shader (or unbind). */
+static void
+update_tcp( struct st_context *st )
+{
+   struct st_tcp_variant_key variant_key;
+   struct st_tessctrl_program *cur;
+
+   if (st->ctx->TessCtrlProgram._Current == NULL) {
+      /* no tessellation control program: clear the shader binding */
+      cso_set_tessctrl_shader_handle(st->cso_context, NULL);
+      return;
+   }
+
+   cur = st_tessctrl_program(st->ctx->TessCtrlProgram._Current);
+   assert(cur->Base.Base.Target == GL_TESS_CONTROL_PROGRAM_NV);
+
+   memset(&variant_key, 0, sizeof(variant_key));
+   variant_key.st = st;
+   st->tcp_variant = st_get_tcp_variant(st, cur, &variant_key);
+
+   st_reference_tesscprog(st, &st->tcp, cur);
+   cso_set_tessctrl_shader_handle(st->cso_context,
+                                  st->tcp_variant->driver_shader);
+}
+
+const struct st_tracked_state st_update_tcp = {
+   "st_update_tcp",			/* name */
+   {					/* dirty */
+      0,				/* mesa */
+      ST_NEW_TESSCTRL_PROGRAM           /* st */
+   },
+   update_tcp  				/* update */
+};
+
+
+
+/* Select the current TES variant and bind its driver shader (or unbind). */
+static void
+update_tep( struct st_context *st )
+{
+   struct st_tep_variant_key variant_key;
+   struct st_tesseval_program *cur;
+
+   if (st->ctx->TessEvalProgram._Current == NULL) {
+      /* no tessellation evaluation program: clear the shader binding */
+      cso_set_tesseval_shader_handle(st->cso_context, NULL);
+      return;
+   }
+
+   cur = st_tesseval_program(st->ctx->TessEvalProgram._Current);
+   assert(cur->Base.Base.Target == GL_TESS_EVALUATION_PROGRAM_NV);
+
+   memset(&variant_key, 0, sizeof(variant_key));
+   variant_key.st = st;
+   st->tep_variant = st_get_tep_variant(st, cur, &variant_key);
+
+   st_reference_tesseprog(st, &st->tep, cur);
+   cso_set_tesseval_shader_handle(st->cso_context,
+                                  st->tep_variant->driver_shader);
+}
+
+const struct st_tracked_state st_update_tep = {
+   "st_update_tep",			/* name */
+   {					/* dirty */
+      0,				/* mesa */
+      ST_NEW_TESSEVAL_PROGRAM           /* st */
+   },
+   update_tep  				/* update */
+};
diff --git a/src/mesa/state_tracker/st_atom_tess.c b/src/mesa/state_tracker/st_atom_tess.c
new file mode 100644
index 00000000000..8e6287a900c
--- /dev/null
+++ b/src/mesa/state_tracker/st_atom_tess.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/*
+ * Authors:
+ *   Marek Olšák <maraeo@gmail.com>
+ */
+
+
+#include "main/macros.h"
+#include "st_context.h"
+#include "pipe/p_context.h"
+#include "st_atom.h"
+
+
+static void
+update_tess(struct st_context *st)
+{
+   struct pipe_context *pipe = st->pipe;
+   const struct gl_context *ctx = st->ctx;
+
+   /* Forward the default patch levels only if the hook is implemented. */
+   if (pipe->set_tess_state) {
+      pipe->set_tess_state(pipe,
+                           ctx->TessCtrlProgram.patch_default_outer_level,
+                           ctx->TessCtrlProgram.patch_default_inner_level);
+   }
+}
+
+
+const struct st_tracked_state st_update_tess = {
+   "st_update_tess",		/* name: matches the symbol, like other atoms */
+   {				/* dirty */
+      0,			/* mesa: no core Mesa flags */
+      ST_NEW_TESS_STATE,	/* st */
+   },
+   update_tess                  /* update: forwards default patch levels */
+};
diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index 04ba86448fc..31e0f6ba06c 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -103,7 +103,8 @@ swizzle_swizzle(unsigned swizzle1, unsigned swizzle2)
  */
 static unsigned
 compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode,
-                               enum pipe_format actualFormat)
+                               enum pipe_format actualFormat,
+                               unsigned glsl_version)
 {
    switch (baseFormat) {
    case GL_RGBA:
@@ -157,8 +158,26 @@ compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode,
       case GL_INTENSITY:
          return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
       case GL_ALPHA:
-         return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO,
-                              SWIZZLE_ZERO, SWIZZLE_X);
+         /* The texture(sampler*Shadow) functions from GLSL 1.30 ignore
+          * the depth mode and return float, while older shadow* functions
+          * and ARB_fp instructions return vec4 according to the depth mode.
+          *
+          * The problem with the GLSL 1.30 functions is that GL_ALPHA forces
+          * them to return 0, breaking them completely.
+          *
+          * A proper fix would increase code complexity and that's not worth
+          * it for a rarely used feature such as the GL_ALPHA depth mode
+          * in GL3. Therefore, change GL_ALPHA to GL_INTENSITY for all
+          * shaders that use GLSL 1.30 or later.
+          *
+          * BTW, it's required that sampler views are updated when
+          * shaders change (check_sampler_swizzle takes care of that).
+          */
+         if (glsl_version && glsl_version >= 130)
+            return SWIZZLE_XXXX;
+         else
+            return MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO,
+                                 SWIZZLE_ZERO, SWIZZLE_X);
       case GL_RED:
          return MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO,
                               SWIZZLE_ZERO, SWIZZLE_ONE);
@@ -174,7 +193,8 @@ compute_texture_format_swizzle(GLenum baseFormat, GLenum depthMode,
 
 
 static unsigned
-get_texture_format_swizzle(const struct st_texture_object *stObj)
+get_texture_format_swizzle(const struct st_texture_object *stObj,
+                           unsigned glsl_version)
 {
    GLenum baseFormat = _mesa_texture_base_format(&stObj->base);
    unsigned tex_swizzle;
@@ -182,7 +202,8 @@ get_texture_format_swizzle(const struct st_texture_object *stObj)
    if (baseFormat != GL_NONE) {
       tex_swizzle = compute_texture_format_swizzle(baseFormat,
                                                    stObj->base.DepthMode,
-                                                   stObj->pt->format);
+                                                   stObj->pt->format,
+                                                   glsl_version);
    }
    else {
       tex_swizzle = SWIZZLE_XYZW;
@@ -201,9 +222,9 @@ get_texture_format_swizzle(const struct st_texture_object *stObj)
  */
 static boolean
 check_sampler_swizzle(const struct st_texture_object *stObj,
-		      struct pipe_sampler_view *sv)
+		      struct pipe_sampler_view *sv, unsigned glsl_version)
 {
-   unsigned swizzle = get_texture_format_swizzle(stObj);
+   unsigned swizzle = get_texture_format_swizzle(stObj, glsl_version);
 
    return ((sv->swizzle_r != GET_SWZ(swizzle, 0)) ||
            (sv->swizzle_g != GET_SWZ(swizzle, 1)) ||
@@ -232,11 +253,11 @@ static unsigned last_layer(struct st_texture_object *stObj)
 static struct pipe_sampler_view *
 st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe,
 					  struct st_texture_object *stObj,
-                                          const struct gl_sampler_object *samp,
-					  enum pipe_format format)
+					  enum pipe_format format,
+                                          unsigned glsl_version)
 {
    struct pipe_sampler_view templ;
-   unsigned swizzle = get_texture_format_swizzle(stObj);
+   unsigned swizzle = get_texture_format_swizzle(stObj, glsl_version);
 
    u_sampler_view_default_template(&templ,
                                    stObj->pt,
@@ -283,8 +304,8 @@ st_create_texture_sampler_view_from_stobj(struct pipe_context *pipe,
 static struct pipe_sampler_view *
 st_get_texture_sampler_view_from_stobj(struct st_context *st,
                                        struct st_texture_object *stObj,
-                                       const struct gl_sampler_object *samp,
-				       enum pipe_format format)
+				       enum pipe_format format,
+                                       unsigned glsl_version)
 {
    struct pipe_sampler_view **sv;
    const struct st_texture_image *firstImage;
@@ -306,7 +327,7 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
 
    /* if sampler view has changed dereference it */
    if (*sv) {
-      if (check_sampler_swizzle(stObj, *sv) ||
+      if (check_sampler_swizzle(stObj, *sv, glsl_version) ||
 	  (format != (*sv)->format) ||
           gl_target_to_pipe(stObj->base.Target) != (*sv)->target ||
           stObj->base.MinLevel + stObj->base.BaseLevel != (*sv)->u.tex.first_level ||
@@ -318,7 +339,8 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
    }
 
    if (!*sv) {
-      *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj, samp, format);
+      *sv = st_create_texture_sampler_view_from_stobj(st->pipe, stObj,
+                                                      format, glsl_version);
 
    } else if ((*sv)->context != st->pipe) {
       /* Recreate view in correct context, use existing view as template */
@@ -334,7 +356,7 @@ st_get_texture_sampler_view_from_stobj(struct st_context *st,
 static GLboolean
 update_single_texture(struct st_context *st,
                       struct pipe_sampler_view **sampler_view,
-		      GLuint texUnit)
+		      GLuint texUnit, unsigned glsl_version)
 {
    struct gl_context *ctx = st->ctx;
    const struct gl_sampler_object *samp;
@@ -374,8 +396,9 @@ update_single_texture(struct st_context *st,
       }
    }
 
-   *sampler_view = st_get_texture_sampler_view_from_stobj(st, stObj, samp,
-							  view_format);
+   *sampler_view =
+      st_get_texture_sampler_view_from_stobj(st, stObj, view_format,
+                                             glsl_version);
    return GL_TRUE;
 }
 
@@ -383,7 +406,7 @@ update_single_texture(struct st_context *st,
 
 static void
 update_textures(struct st_context *st,
-                unsigned shader_stage,
+                gl_shader_stage mesa_shader,
                 const struct gl_program *prog,
                 unsigned max_units,
                 struct pipe_sampler_view **sampler_views,
@@ -392,6 +415,10 @@ update_textures(struct st_context *st,
    const GLuint old_max = *num_textures;
    GLbitfield samplers_used = prog->SamplersUsed;
    GLuint unit;
+   struct gl_shader_program *shader =
+      st->ctx->_Shader->CurrentProgram[mesa_shader];
+   unsigned glsl_version = shader ? shader->Version : 0;
+   unsigned shader_stage = st_shader_stage_to_ptarget(mesa_shader);
 
    if (samplers_used == 0x0 && old_max == 0)
       return;
@@ -406,7 +433,8 @@ update_textures(struct st_context *st,
          const GLuint texUnit = prog->SamplerUnits[unit];
          GLboolean retval;
 
-         retval = update_single_texture(st, &sampler_view, texUnit);
+         retval = update_single_texture(st, &sampler_view, texUnit,
+                                        glsl_version);
          if (retval == GL_FALSE)
             continue;
 
@@ -435,7 +463,7 @@ update_vertex_textures(struct st_context *st)
 
    if (ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits > 0) {
       update_textures(st,
-                      PIPE_SHADER_VERTEX,
+                      MESA_SHADER_VERTEX,
                       &ctx->VertexProgram._Current->Base,
                       ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_VERTEX],
@@ -450,7 +478,7 @@ update_fragment_textures(struct st_context *st)
    const struct gl_context *ctx = st->ctx;
 
    update_textures(st,
-                   PIPE_SHADER_FRAGMENT,
+                   MESA_SHADER_FRAGMENT,
                    &ctx->FragmentProgram._Current->Base,
                    ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits,
                    st->state.sampler_views[PIPE_SHADER_FRAGMENT],
@@ -465,7 +493,7 @@ update_geometry_textures(struct st_context *st)
 
    if (ctx->GeometryProgram._Current) {
       update_textures(st,
-                      PIPE_SHADER_GEOMETRY,
+                      MESA_SHADER_GEOMETRY,
                       &ctx->GeometryProgram._Current->Base,
                       ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits,
                       st->state.sampler_views[PIPE_SHADER_GEOMETRY],
@@ -474,11 +502,43 @@ update_geometry_textures(struct st_context *st)
 }
 
 
+static void
+update_tessctrl_textures(struct st_context *st)
+{
+   const struct gl_context *ctx = st->ctx;
+
+   if (!ctx->TessCtrlProgram._Current)
+      return; /* no tess control program is current: nothing to update */
+
+   update_textures(st, MESA_SHADER_TESS_CTRL,
+                   &ctx->TessCtrlProgram._Current->Base,
+                   ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits,
+                   st->state.sampler_views[PIPE_SHADER_TESS_CTRL],
+                   &st->state.num_sampler_views[PIPE_SHADER_TESS_CTRL]);
+}
+
+
+static void
+update_tesseval_textures(struct st_context *st)
+{
+   const struct gl_context *ctx = st->ctx;
+
+   if (!ctx->TessEvalProgram._Current)
+      return; /* no tess eval program is current: nothing to update */
+
+   update_textures(st, MESA_SHADER_TESS_EVAL,
+                   &ctx->TessEvalProgram._Current->Base,
+                   ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits,
+                   st->state.sampler_views[PIPE_SHADER_TESS_EVAL],
+                   &st->state.num_sampler_views[PIPE_SHADER_TESS_EVAL]);
+}
+
+
 const struct st_tracked_state st_update_fragment_texture = {
    "st_update_texture",					/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_FRAGMENT_PROGRAM,				/* st */
+      ST_NEW_FRAGMENT_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_fragment_textures				/* update */
 };
@@ -488,7 +548,7 @@ const struct st_tracked_state st_update_vertex_texture = {
    "st_update_vertex_texture",				/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_VERTEX_PROGRAM,				/* st */
+      ST_NEW_VERTEX_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_vertex_textures				/* update */
 };
@@ -498,52 +558,27 @@ const struct st_tracked_state st_update_geometry_texture = {
    "st_update_geometry_texture",			/* name */
    {							/* dirty */
       _NEW_TEXTURE,					/* mesa */
-      ST_NEW_GEOMETRY_PROGRAM,				/* st */
+      ST_NEW_GEOMETRY_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
    update_geometry_textures				/* update */
 };
 
 
-
-static void
-finalize_textures(struct st_context *st)
-{
-   struct gl_context *ctx = st->ctx;
-   struct gl_fragment_program *fprog = ctx->FragmentProgram._Current;
-   const GLboolean prev_missing_textures = st->missing_textures;
-   GLuint su;
-
-   st->missing_textures = GL_FALSE;
-
-   for (su = 0; su < ctx->Const.MaxTextureCoordUnits; su++) {
-      if (fprog->Base.SamplersUsed & (1 << su)) {
-         const GLuint texUnit = fprog->Base.SamplerUnits[su];
-         struct gl_texture_object *texObj
-            = ctx->Texture.Unit[texUnit]._Current;
-
-         if (texObj) {
-            GLboolean retval;
-
-            retval = st_finalize_texture(ctx, st->pipe, texObj);
-            if (!retval) {
-               /* out of mem */
-               st->missing_textures = GL_TRUE;
-               continue;
-            }
-         }
-      }
-   }
-
-   if (prev_missing_textures != st->missing_textures)
-      st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
-}
-
-
-const struct st_tracked_state st_finalize_textures = {
-   "st_finalize_textures",		/* name */
-   {					/* dirty */
-      _NEW_TEXTURE,			/* mesa */
-      0,				/* st */
+const struct st_tracked_state st_update_tessctrl_texture = {
+   "st_update_tessctrl_texture",			/* name */
+   {							/* dirty */
+      _NEW_TEXTURE,					/* mesa */
+      ST_NEW_TESSCTRL_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
    },
-   finalize_textures			/* update */
+   update_tessctrl_textures				/* update */
+};
+
+
+const struct st_tracked_state st_update_tesseval_texture = {
+   "st_update_tesseval_texture",			/* name */
+   {							/* dirty */
+      _NEW_TEXTURE,					/* mesa */
+      ST_NEW_TESSEVAL_PROGRAM | ST_NEW_SAMPLER_VIEWS,	/* st */
+   },
+   update_tesseval_textures				/* update */
 };
diff --git a/src/mesa/state_tracker/st_atom_viewport.c b/src/mesa/state_tracker/st_atom_viewport.c
index 2f62590c4f1..9a692cecade 100644
--- a/src/mesa/state_tracker/st_atom_viewport.c
+++ b/src/mesa/state_tracker/st_atom_viewport.c
@@ -64,7 +64,7 @@ update_viewport( struct st_context *st )
     */
    for (i = 0; i < ctx->Const.MaxViewports; i++)
    {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, i, scale, translate);
 
       st->state.viewport[i].scale[0] = scale[0];
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index c881e194f70..01a96c18264 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -446,8 +446,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    assert(height <= (GLsizei)maxSize);
 
    cso_save_rasterizer(cso);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(cso);
+   cso_save_fragment_sampler_views(cso);
    cso_save_viewport(cso);
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
@@ -535,8 +535,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
 
    /* restore state */
    cso_restore_rasterizer(cso);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(cso);
+   cso_restore_fragment_sampler_views(cso);
    cso_restore_viewport(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
diff --git a/src/mesa/state_tracker/st_cb_blit.c b/src/mesa/state_tracker/st_cb_blit.c
index 6d9371852c5..139690615d6 100644
--- a/src/mesa/state_tracker/st_cb_blit.c
+++ b/src/mesa/state_tracker/st_cb_blit.c
@@ -39,7 +39,7 @@
 #include "st_cb_bitmap.h"
 #include "st_cb_blit.h"
 #include "st_cb_fbo.h"
-#include "st_atom.h"
+#include "st_manager.h"
 
 #include "util/u_format.h"
 
@@ -92,7 +92,7 @@ st_BlitFramebuffer(struct gl_context *ctx,
    } clip;
    struct pipe_blit_info blit;
 
-   st_validate_state(st);
+   st_manager_validate_framebuffers(st);
 
    /* Make sure bitmap rendering has landed in the framebuffers */
    st_flush_bitmap_cache(st);
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index a6a98c83aa6..b372697026b 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -33,6 +33,7 @@
 #include "main/imports.h"
 #include "main/image.h"
 #include "main/bufferobj.h"
+#include "main/blit.h"
 #include "main/format_pack.h"
 #include "main/macros.h"
 #include "main/mtypes.h"
@@ -688,8 +689,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
 
    cso_save_rasterizer(cso);
    cso_save_viewport(cso);
-   cso_save_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_save_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_save_fragment_samplers(cso);
+   cso_save_fragment_sampler_views(cso);
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
@@ -756,6 +757,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* texture sampling state: */
    {
       struct pipe_sampler_state sampler;
+      const struct pipe_sampler_state *states[2] = {&sampler, &sampler};
+
       memset(&sampler, 0, sizeof(sampler));
       sampler.wrap_s = PIPE_TEX_WRAP_CLAMP;
       sampler.wrap_t = PIPE_TEX_WRAP_CLAMP;
@@ -765,11 +768,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
       sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
       sampler.normalized_coords = normalized;
 
-      cso_single_sampler(cso, PIPE_SHADER_FRAGMENT, 0, &sampler);
-      if (num_sampler_view > 1) {
-         cso_single_sampler(cso, PIPE_SHADER_FRAGMENT, 1, &sampler);
-      }
-      cso_single_sampler_done(cso, PIPE_SHADER_FRAGMENT);
+      cso_set_samplers(cso, PIPE_SHADER_FRAGMENT,
+                       num_sampler_view > 1 ? 2 : 1, states);
    }
 
    /* viewport state: viewport matching window dims */
@@ -816,8 +816,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* restore state */
    cso_restore_rasterizer(cso);
    cso_restore_viewport(cso);
-   cso_restore_samplers(cso, PIPE_SHADER_FRAGMENT);
-   cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
+   cso_restore_fragment_samplers(cso);
+   cso_restore_fragment_sampler_views(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
    cso_restore_tessctrl_shader(cso);
@@ -1312,31 +1312,6 @@ st_get_color_read_renderbuffer(struct gl_context *ctx)
 }
 
 
-/**
- * \return TRUE if two regions overlap, FALSE otherwise
- */
-static boolean
-regions_overlap(int srcX0, int srcY0,
-                int srcX1, int srcY1,
-                int dstX0, int dstY0,
-                int dstX1, int dstY1)
-{
-   if (MAX2(srcX0, srcX1) < MIN2(dstX0, dstX1))
-      return FALSE; /* src completely left of dst */
-
-   if (MAX2(dstX0, dstX1) < MIN2(srcX0, srcX1))
-      return FALSE; /* dst completely left of src */
-
-   if (MAX2(srcY0, srcY1) < MIN2(dstY0, dstY1))
-      return FALSE; /* src completely above dst */
-
-   if (MAX2(dstY0, dstY1) < MIN2(srcY0, srcY1))
-      return FALSE; /* dst completely above src */
-
-   return TRUE; /* some overlap */
-}
-
-
 /**
  * Try to do a glCopyPixels for simple cases with a blit by calling
  * pipe->blit().
@@ -1420,8 +1395,8 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
       }
 
       if (rbRead != rbDraw ||
-          !regions_overlap(readX, readY, readX + readW, readY + readH,
-                           drawX, drawY, drawX + drawW, drawY + drawH)) {
+          !_mesa_regions_overlap(readX, readY, readX + readW, readY + readH,
+                                 drawX, drawY, drawX + drawW, drawY + drawH)) {
          struct pipe_blit_info blit;
 
          memset(&blit, 0, sizeof(blit));
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 0399eef7204..57075904450 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -511,8 +511,6 @@ st_render_texture(struct gl_context *ctx,
    strb->rtt_layered = att->Layered;
    pipe_resource_reference(&strb->texture, pt);
 
-   pipe_surface_release(pipe, &strb->surface);
-
    st_update_renderbuffer_surface(st, strb);
 
    strb->Base.Format = st_pipe_format_to_mesa_format(pt->format);
diff --git a/src/mesa/state_tracker/st_cb_perfmon.h b/src/mesa/state_tracker/st_cb_perfmon.h
index 13d3627de5d..0b195de47fe 100644
--- a/src/mesa/state_tracker/st_cb_perfmon.h
+++ b/src/mesa/state_tracker/st_cb_perfmon.h
@@ -46,7 +46,7 @@ struct st_perf_counter_object
 /**
  * Cast wrapper
  */
-static INLINE struct st_perf_monitor_object *
+static inline struct st_perf_monitor_object *
 st_perf_monitor_object(struct gl_perf_monitor_object *q)
 {
    return (struct st_perf_monitor_object *)q;
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 6aa7d5796d9..3029909d12d 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -68,6 +68,12 @@ st_bind_program(struct gl_context *ctx, GLenum target, struct gl_program *prog)
    case GL_GEOMETRY_PROGRAM_NV:
       st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM;
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
+      break;
    }
 }
 
@@ -84,6 +90,8 @@ st_use_program(struct gl_context *ctx, struct gl_shader_program *shProg)
    st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
    st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
    st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
+   st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM;
+   st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
 }
 
 
@@ -110,6 +118,16 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id)
       return _mesa_init_geometry_program(ctx, &prog->Base, target, id);
    }
 
+   case GL_TESS_CONTROL_PROGRAM_NV: {
+      struct st_tessctrl_program *prog = ST_CALLOC_STRUCT(st_tessctrl_program);
+      return _mesa_init_tess_ctrl_program(ctx, &prog->Base, target, id);
+   }
+
+   case GL_TESS_EVALUATION_PROGRAM_NV: {
+      struct st_tesseval_program *prog = ST_CALLOC_STRUCT(st_tesseval_program);
+      return _mesa_init_tess_eval_program(ctx, &prog->Base, target, id);
+   }
+
    default:
       assert(0);
       return NULL;
@@ -157,6 +175,28 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
             free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi);
       }
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      {
+         struct st_tessctrl_program *sttcp =
+            (struct st_tessctrl_program *) prog;
+
+         st_release_tcp_variants(st, sttcp);
+
+         if (sttcp->glsl_to_tgsi)
+            free_glsl_to_tgsi_visitor(sttcp->glsl_to_tgsi);
+      }
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      {
+         struct st_tesseval_program *sttep =
+            (struct st_tesseval_program *) prog;
+
+         st_release_tep_variants(st, sttep);
+
+         if (sttep->glsl_to_tgsi)
+            free_glsl_to_tgsi_visitor(sttep->glsl_to_tgsi);
+      }
+      break;
    default:
       assert(0); /* problem */
    }
@@ -214,6 +254,24 @@ st_program_string_notify( struct gl_context *ctx,
       if (st->vp == stvp)
 	 st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
    }
+   else if (target == GL_TESS_CONTROL_PROGRAM_NV) {
+      struct st_tessctrl_program *sttcp =
+         (struct st_tessctrl_program *) prog;
+
+      st_release_tcp_variants(st, sttcp);
+
+      if (st->tcp == sttcp)
+         st->dirty.st |= ST_NEW_TESSCTRL_PROGRAM;
+   }
+   else if (target == GL_TESS_EVALUATION_PROGRAM_NV) {
+      struct st_tesseval_program *sttep =
+         (struct st_tesseval_program *) prog;
+
+      st_release_tep_variants(st, sttep);
+
+      if (st->tep == sttep)
+         st->dirty.st |= ST_NEW_TESSEVAL_PROGRAM;
+   }
 
    if (ST_DEBUG & DEBUG_PRECOMPILE)
       st_precompile_shader_variant(st, prog);
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index 272cbb91d52..b9997dacfd2 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -254,7 +254,7 @@ st_RasterPos(struct gl_context *ctx, const GLfloat v[4])
     * st_feedback_draw_vbo doesn't check for that flag. */
    ctx->Array._DrawArrays = rs->arrays;
    st_feedback_draw_vbo(ctx, &rs->prim, 1, NULL, GL_TRUE, 0, 1,
-                        NULL, NULL);
+                        NULL, 0, NULL);
    ctx->Array._DrawArrays = saved_arrays;
 
    /* restore draw's rasterization stage depending on rendermode */
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index d95a608d32e..18ea43fa71a 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -43,6 +43,30 @@
 #include "state_tracker/st_format.h"
 #include "state_tracker/st_texture.h"
 
+/**
+ * Return TRUE if ReadPixels would have to convert between signed and
+ * unsigned integer data, i.e. the read renderbuffer chosen for \p format
+ * has a GL_INT datatype while \p type is an unsigned integer type, or it
+ * has a GL_UNSIGNED_INT datatype while \p type is a signed integer type.
+ */
+static boolean
+needs_integer_signed_unsigned_conversion(const struct gl_context *ctx,
+                                         GLenum format, GLenum type)
+{
+   struct gl_renderbuffer *rb =
+      _mesa_get_read_renderbuffer_for_format(ctx, format);
+   GLenum srcType;
+
+   assert(rb);
+   srcType = _mesa_get_format_datatype(rb->Format);
+
+   if (srcType == GL_INT)
+      return type == GL_UNSIGNED_INT || type == GL_UNSIGNED_SHORT ||
+             type == GL_UNSIGNED_BYTE;
+   if (srcType == GL_UNSIGNED_INT)
+      return type == GL_INT || type == GL_SHORT || type == GL_BYTE;
+   return FALSE;
+}
 
 /**
  * This uses a blit to copy the read buffer to a texture format which matches
@@ -123,6 +147,10 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y,
       goto fallback;
    }
 
+   if (needs_integer_signed_unsigned_conversion(ctx, format, type)) {
+      goto fallback;
+   }
+
    /* Convert the source format to what is expected by ReadPixels
     * and see if it's supported. */
    src_format = util_format_linear(src->format);
diff --git a/src/mesa/state_tracker/st_cb_syncobj.c b/src/mesa/state_tracker/st_cb_syncobj.c
index 6d875b851a2..ec2687fba53 100644
--- a/src/mesa/state_tracker/st_cb_syncobj.c
+++ b/src/mesa/state_tracker/st_cb_syncobj.c
@@ -81,7 +81,13 @@ static void st_check_sync(struct gl_context *ctx, struct gl_sync_object *obj)
    struct pipe_screen *screen = st_context(ctx)->pipe->screen;
    struct st_sync_object *so = (struct st_sync_object*)obj;
 
-   if (so->fence && screen->fence_signalled(screen, so->fence)) {
+   /* If the fence doesn't exist, assume it's signalled. */
+   if (!so->fence) {
+      so->b.StatusFlag = GL_TRUE;
+      return;
+   }
+
+   if (screen->fence_finish(screen, so->fence, 0)) {
       screen->fence_reference(screen, &so->fence, NULL);
       so->b.StatusFlag = GL_TRUE;
    }
@@ -94,6 +100,12 @@ static void st_client_wait_sync(struct gl_context *ctx,
    struct pipe_screen *screen = st_context(ctx)->pipe->screen;
    struct st_sync_object *so = (struct st_sync_object*)obj;
 
+   /* If the fence doesn't exist, assume it's signalled. */
+   if (!so->fence) {
+      so->b.StatusFlag = GL_TRUE;
+      return;
+   }
+
    /* We don't care about GL_SYNC_FLUSH_COMMANDS_BIT, because flush is
     * already called when creating a fence. */
 
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 7ea3846fff1..715d69c0c68 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -896,7 +896,7 @@ st_CompressedTexImage(struct gl_context *ctx, GLuint dims,
 
 
 /**
- * Called via ctx->Driver.GetTexImage()
+ * Called via ctx->Driver.GetTexSubImage()
  *
  * This uses a blit to copy the texture to a texture format which matches
  * the format and type combo and then a fast read-back is done using memcpy.
@@ -910,16 +910,15 @@ st_CompressedTexImage(struct gl_context *ctx, GLuint dims,
  *       we do here should be free in such cases.
  */
 static void
-st_GetTexImage(struct gl_context * ctx,
-               GLenum format, GLenum type, GLvoid * pixels,
-               struct gl_texture_image *texImage)
+st_GetTexSubImage(struct gl_context * ctx,
+                  GLint xoffset, GLint yoffset, GLint zoffset,
+                  GLsizei width, GLsizei height, GLint depth,
+                  GLenum format, GLenum type, GLvoid * pixels,
+                  struct gl_texture_image *texImage)
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
    struct pipe_screen *screen = pipe->screen;
-   GLuint width = texImage->Width;
-   GLuint height = texImage->Height;
-   GLuint depth = texImage->Depth;
    struct st_texture_image *stImage = st_texture_image(texImage);
    struct st_texture_object *stObj = st_texture_object(texImage->TexObject);
    struct pipe_resource *src = stObj->pt;
@@ -1054,7 +1053,7 @@ st_GetTexImage(struct gl_context * ctx,
       }
    }
 
-   /* create the destination texture */
+   /* create the destination texture of size (width X height X depth) */
    memset(&dst_templ, 0, sizeof(dst_templ));
    dst_templ.target = pipe_target;
    dst_templ.format = dst_format;
@@ -1076,6 +1075,10 @@ st_GetTexImage(struct gl_context * ctx,
       height = 1;
    }
 
+   assert(texImage->Face == 0 ||
+          texImage->TexObject->MinLayer == 0 ||
+          zoffset == 0);
+
    memset(&blit, 0, sizeof(blit));
    blit.src.resource = src;
    blit.src.level = texImage->Level + texImage->TexObject->MinLevel;
@@ -1083,9 +1086,11 @@ st_GetTexImage(struct gl_context * ctx,
    blit.dst.resource = dst;
    blit.dst.level = 0;
    blit.dst.format = dst->format;
-   blit.src.box.x = blit.dst.box.x = 0;
-   blit.src.box.y = blit.dst.box.y = 0;
-   blit.src.box.z = texImage->Face + texImage->TexObject->MinLayer;
+   blit.src.box.x = xoffset;
+   blit.dst.box.x = 0;
+   blit.src.box.y = yoffset;
+   blit.dst.box.y = 0;
+   blit.src.box.z = texImage->Face + texImage->TexObject->MinLayer + zoffset;
    blit.dst.box.z = 0;
    blit.src.box.width = blit.dst.box.width = width;
    blit.src.box.height = blit.dst.box.height = height;
@@ -1206,7 +1211,9 @@ end:
 
 fallback:
    if (!done) {
-      _mesa_GetTexImage_sw(ctx, format, type, pixels, texImage);
+      _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset,
+                              width, height, depth,
+                              format, type, pixels, texImage);
    }
 }
 
@@ -1876,11 +1883,11 @@ st_init_texture_functions(struct dd_function_table *functions)
    functions->CopyTexSubImage = st_CopyTexSubImage;
    functions->GenerateMipmap = st_generate_mipmap;
 
-   functions->GetTexImage = st_GetTexImage;
+   functions->GetTexSubImage = st_GetTexSubImage;
 
    /* compressed texture functions */
    functions->CompressedTexImage = st_CompressedTexImage;
-   functions->GetCompressedTexImage = _mesa_GetCompressedTexImage_sw;
+   functions->GetCompressedTexSubImage = _mesa_GetCompressedTexSubImage_sw;
 
    functions->NewTextureObject = st_NewTextureObject;
    functions->NewTextureImage = st_NewTextureImage;
diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c
index 07c118e227b..0c01cd5ab78 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.c
+++ b/src/mesa/state_tracker/st_cb_xformfb.c
@@ -54,9 +54,9 @@ struct st_transform_feedback_object {
    struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
 
    /* This encapsulates the count that can be used as a source for draw_vbo.
-    * It contains a stream output target from the last call of
-    * EndTransformFeedback. */
-   struct pipe_stream_output_target *draw_count;
+    * It contains stream output targets from the last call of
+    * EndTransformFeedback for each stream. */
+   struct pipe_stream_output_target *draw_count[MAX_VERTEX_STREAMS];
 };
 
 static inline struct st_transform_feedback_object *
@@ -88,7 +88,8 @@ st_delete_transform_feedback(struct gl_context *ctx,
          st_transform_feedback_object(obj);
    unsigned i;
 
-   pipe_so_target_reference(&sobj->draw_count, NULL);
+   for (i = 0; i < ARRAY_SIZE(sobj->draw_count); i++)
+      pipe_so_target_reference(&sobj->draw_count[i], NULL);
 
    /* Unreference targets. */
    for (i = 0; i < sobj->num_targets; i++) {
@@ -123,9 +124,12 @@ st_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
       struct st_buffer_object *bo = st_buffer_object(sobj->base.Buffers[i]);
 
       if (bo && bo->buffer) {
+         unsigned stream =
+            obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+
          /* Check whether we need to recreate the target. */
          if (!sobj->targets[i] ||
-             sobj->targets[i] == sobj->draw_count ||
+             sobj->targets[i] == sobj->draw_count[stream] ||
              sobj->targets[i]->buffer != bo->buffer ||
              sobj->targets[i]->buffer_offset != sobj->base.Offset[i] ||
              sobj->targets[i]->buffer_size != sobj->base.Size[i]) {
@@ -178,24 +182,6 @@ st_resume_transform_feedback(struct gl_context *ctx,
 }
 
 
-static struct pipe_stream_output_target *
-st_transform_feedback_get_draw_target(struct gl_transform_feedback_object *obj)
-{
-   struct st_transform_feedback_object *sobj =
-         st_transform_feedback_object(obj);
-   unsigned i;
-
-   for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) {
-      if (sobj->targets[i]) {
-         return sobj->targets[i];
-      }
-   }
-
-   assert(0);
-   return NULL;
-}
-
-
 static void
 st_end_transform_feedback(struct gl_context *ctx,
                           struct gl_transform_feedback_object *obj)
@@ -203,22 +189,41 @@ st_end_transform_feedback(struct gl_context *ctx,
    struct st_context *st = st_context(ctx);
    struct st_transform_feedback_object *sobj =
          st_transform_feedback_object(obj);
+   unsigned i;
 
    cso_set_stream_outputs(st->cso_context, 0, NULL, NULL);
 
-   pipe_so_target_reference(&sobj->draw_count,
-                            st_transform_feedback_get_draw_target(obj));
+   /* The next call to glDrawTransformFeedbackStream should use the vertex
+    * count from the last call to glEndTransformFeedback.
+    * Therefore, save the targets for each stream.
+    *
+    * NULL means the vertex counter is 0 (initial state).
+    */
+   for (i = 0; i < ARRAY_SIZE(sobj->draw_count); i++)
+      pipe_so_target_reference(&sobj->draw_count[i], NULL);
+
+   for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) {
+      unsigned stream =
+         obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+
+      /* Is it not bound or already set for this stream? */
+      if (!sobj->targets[i] || sobj->draw_count[stream])
+         continue;
+
+      pipe_so_target_reference(&sobj->draw_count[stream], sobj->targets[i]);
+   }
 }
 
 
-void
+bool
 st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj,
-                                struct pipe_draw_info *out)
+                                unsigned stream, struct pipe_draw_info *out)
 {
    struct st_transform_feedback_object *sobj =
          st_transform_feedback_object(obj);
 
-   out->count_from_stream_output = sobj->draw_count;
+   out->count_from_stream_output = sobj->draw_count[stream];
+   return out->count_from_stream_output != NULL;
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_xformfb.h b/src/mesa/state_tracker/st_cb_xformfb.h
index 998c418257b..444d11842c5 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.h
+++ b/src/mesa/state_tracker/st_cb_xformfb.h
@@ -38,9 +38,9 @@ struct pipe_draw_info;
 extern void
 st_init_xformfb_functions(struct dd_function_table *functions);
 
-extern void
+extern bool
 st_transform_feedback_draw_init(struct gl_transform_feedback_object *obj,
-                                struct pipe_draw_info *out);
+                                unsigned stream, struct pipe_draw_info *out);
 
 
 #endif /* ST_CB_XFORMFB_H */
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index ed9ed0f1b6c..72c23cad4bc 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -287,6 +287,11 @@ st_create_context_priv( struct gl_context *ctx, struct pipe_context *pipe,
    /* For vertex shaders, make sure not to emit saturate when SM 3.0 is not supported */
    ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoSat = !st->has_shader_model3;
 
+   if (!ctx->Extensions.ARB_gpu_shader5) {
+      for (i = 0; i < MESA_SHADER_STAGES; i++)
+         ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectSampler = true;
+   }
+
    _mesa_compute_version(ctx);
 
    if (ctx->Version == 0) {
@@ -308,6 +313,8 @@ static void st_init_driver_flags(struct gl_driver_flags *f)
    f->NewArray = ST_NEW_VERTEX_ARRAYS;
    f->NewRasterizerDiscard = ST_NEW_RASTERIZER;
    f->NewUniformBuffer = ST_NEW_UNIFORM_BUFFER;
+   f->NewDefaultTessLevels = ST_NEW_TESS_STATE;
+   f->NewTextureBuffer = ST_NEW_SAMPLER_VIEWS;
 }
 
 struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
@@ -369,6 +376,8 @@ void st_destroy_context( struct st_context *st )
    st_reference_fragprog(st, &st->fp, NULL);
    st_reference_geomprog(st, &st->gp, NULL);
    st_reference_vertprog(st, &st->vp, NULL);
+   st_reference_tesscprog(st, &st->tcp, NULL);
+   st_reference_tesseprog(st, &st->tep, NULL);
 
    /* release framebuffer surfaces */
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index dac5a4b9006..81d5480431a 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -53,11 +53,14 @@ struct u_upload_mgr;
 #define ST_NEW_FRAGMENT_PROGRAM        (1 << 1)
 #define ST_NEW_VERTEX_PROGRAM          (1 << 2)
 #define ST_NEW_FRAMEBUFFER             (1 << 3)
-/* gap, re-use it */
+#define ST_NEW_TESS_STATE              (1 << 4)
 #define ST_NEW_GEOMETRY_PROGRAM        (1 << 5)
 #define ST_NEW_VERTEX_ARRAYS           (1 << 6)
 #define ST_NEW_RASTERIZER              (1 << 7)
 #define ST_NEW_UNIFORM_BUFFER          (1 << 8)
+#define ST_NEW_TESSCTRL_PROGRAM        (1 << 9)
+#define ST_NEW_TESSEVAL_PROGRAM        (1 << 10)
+#define ST_NEW_SAMPLER_VIEWS           (1 << 11)
 
 
 struct st_state_flags {
@@ -137,7 +140,6 @@ struct st_context
 
    struct st_state_flags dirty;
 
-   GLboolean missing_textures;
    GLboolean vertdata_edgeflags;
    GLboolean edgeflag_culls_prims;
 
@@ -147,10 +149,14 @@ struct st_context
    struct st_vertex_program *vp;    /**< Currently bound vertex program */
    struct st_fragment_program *fp;  /**< Currently bound fragment program */
    struct st_geometry_program *gp;  /**< Currently bound geometry program */
+   struct st_tessctrl_program *tcp; /**< Currently bound tess control program */
+   struct st_tesseval_program *tep; /**< Currently bound tess eval program */
 
    struct st_vp_variant *vp_variant;
    struct st_fp_variant *fp_variant;
    struct st_gp_variant *gp_variant;
+   struct st_tcp_variant *tcp_variant;
+   struct st_tep_variant *tep_variant;
 
    struct gl_texture_object *default_texture;
 
@@ -272,6 +278,29 @@ st_fb_orientation(const struct gl_framebuffer *fb)
 }
 
 
+static inline unsigned
+st_shader_stage_to_ptarget(gl_shader_stage stage)
+{
+   switch (stage) {
+   case MESA_SHADER_VERTEX:
+      return PIPE_SHADER_VERTEX;
+   case MESA_SHADER_FRAGMENT:
+      return PIPE_SHADER_FRAGMENT;
+   case MESA_SHADER_GEOMETRY:
+      return PIPE_SHADER_GEOMETRY;
+   case MESA_SHADER_TESS_CTRL:
+      return PIPE_SHADER_TESS_CTRL;
+   case MESA_SHADER_TESS_EVAL:
+      return PIPE_SHADER_TESS_EVAL;
+   case MESA_SHADER_COMPUTE:
+      return PIPE_SHADER_COMPUTE;
+   }
+
+   assert(!"should not be reached");
+   return PIPE_SHADER_VERTEX;
+}
+
+
 /** clear-alloc a struct-sized object, with casting */
 #define ST_CALLOC_STRUCT(T)   (struct T *) calloc(1, sizeof(struct T))
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 8b43582c14b..957fcfd410e 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -164,6 +164,7 @@ translate_prim(const struct gl_context *ctx, unsigned prim)
    STATIC_ASSERT(GL_POINTS == PIPE_PRIM_POINTS);
    STATIC_ASSERT(GL_QUADS == PIPE_PRIM_QUADS);
    STATIC_ASSERT(GL_TRIANGLE_STRIP_ADJACENCY == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY);
+   STATIC_ASSERT(GL_PATCHES == PIPE_PRIM_PATCHES);
 
    return prim;
 }
@@ -183,6 +184,7 @@ st_draw_vbo(struct gl_context *ctx,
             GLuint min_index,
             GLuint max_index,
             struct gl_transform_feedback_object *tfb_vertcount,
+            unsigned stream,
             struct gl_buffer_object *indirect)
 {
    struct st_context *st = st_context(ctx);
@@ -241,7 +243,8 @@ st_draw_vbo(struct gl_context *ctx,
       /* Transform feedback drawing is always non-indexed. */
       /* Set info.count_from_stream_output. */
       if (tfb_vertcount) {
-         st_transform_feedback_draw_init(tfb_vertcount, &info);
+         if (!st_transform_feedback_draw_init(tfb_vertcount, stream, &info))
+            return;
       }
    }
 
@@ -260,6 +263,7 @@ st_draw_vbo(struct gl_context *ctx,
       info.count = prims[i].count;
       info.start_instance = prims[i].base_instance;
       info.instance_count = prims[i].num_instances;
+      info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
       info.index_bias = prims[i].basevertex;
       if (!ib) {
          info.min_index = info.start;
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index 780d4bde713..a973c8a4a5d 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -56,6 +56,7 @@ st_draw_vbo(struct gl_context *ctx,
             GLuint min_index,
             GLuint max_index,
             struct gl_transform_feedback_object *tfb_vertcount,
+            unsigned stream,
             struct gl_buffer_object *indirect);
 
 extern void
@@ -67,6 +68,7 @@ st_feedback_draw_vbo(struct gl_context *ctx,
                      GLuint min_index,
                      GLuint max_index,
                      struct gl_transform_feedback_object *tfb_vertcount,
+                     unsigned stream,
                      struct gl_buffer_object *indirect);
 
 /**
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index 177f6b5aefa..88c10a8f150 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -117,6 +117,7 @@ st_feedback_draw_vbo(struct gl_context *ctx,
                      GLuint min_index,
                      GLuint max_index,
                      struct gl_transform_feedback_object *tfb_vertcount,
+                     unsigned stream,
                      struct gl_buffer_object *indirect)
 {
    struct st_context *st = st_context(ctx);
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index b1057f3eadd..17f572f80fb 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -165,6 +165,14 @@ void st_init_limits(struct pipe_screen *screen,
          pc = &c->Program[MESA_SHADER_GEOMETRY];
          options = &c->ShaderCompilerOptions[MESA_SHADER_GEOMETRY];
          break;
+      case PIPE_SHADER_TESS_CTRL:
+         pc = &c->Program[MESA_SHADER_TESS_CTRL];
+         options = &c->ShaderCompilerOptions[MESA_SHADER_TESS_CTRL];
+         break;
+      case PIPE_SHADER_TESS_EVAL:
+         pc = &c->Program[MESA_SHADER_TESS_EVAL];
+         options = &c->ShaderCompilerOptions[MESA_SHADER_TESS_EVAL];
+         break;
       default:
          /* compute shader, etc. */
          continue;
@@ -245,8 +253,12 @@ void st_init_limits(struct pipe_screen *screen,
       options->LowerClipDistance = true;
    }
 
+   c->LowerTessLevel = true;
+
    c->MaxCombinedTextureImageUnits =
          _min(c->Program[MESA_SHADER_VERTEX].MaxTextureImageUnits +
+              c->Program[MESA_SHADER_TESS_CTRL].MaxTextureImageUnits +
+              c->Program[MESA_SHADER_TESS_EVAL].MaxTextureImageUnits +
               c->Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits +
               c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits,
               MAX_COMBINED_TEXTURE_IMAGE_UNITS);
@@ -266,6 +278,9 @@ void st_init_limits(struct pipe_screen *screen,
    c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING);
    c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES);
    c->MaxGeometryTotalOutputComponents = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS);
+   c->MaxTessPatchComponents =
+      MAX2(screen->get_param(screen, PIPE_CAP_MAX_SHADER_PATCH_VARYINGS),
+           MAX_VARYING) * 4;
 
    c->MinProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MIN_TEXEL_OFFSET);
    c->MaxProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MAX_TEXEL_OFFSET);
@@ -301,6 +316,8 @@ void st_init_limits(struct pipe_screen *screen,
          screen->get_param(screen, PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT);
       c->MaxCombinedUniformBlocks = c->MaxUniformBufferBindings =
          c->Program[MESA_SHADER_VERTEX].MaxUniformBlocks +
+         c->Program[MESA_SHADER_TESS_CTRL].MaxUniformBlocks +
+         c->Program[MESA_SHADER_TESS_EVAL].MaxUniformBlocks +
          c->Program[MESA_SHADER_GEOMETRY].MaxUniformBlocks +
          c->Program[MESA_SHADER_FRAGMENT].MaxUniformBlocks;
       assert(c->MaxCombinedUniformBlocks <= MAX_COMBINED_UNIFORM_BUFFERS);
@@ -417,12 +434,14 @@ void st_init_extensions(struct pipe_screen *screen,
 
    static const struct st_extension_cap_mapping cap_mapping[] = {
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
-      { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT },
+      { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
+      { o(ARB_color_buffer_float),           PIPE_CAP_VERTEX_COLOR_UNCLAMPED           },
       { o(ARB_depth_clamp),                  PIPE_CAP_DEPTH_CLIP_DISABLE               },
       { o(ARB_depth_texture),                PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_draw_buffers_blend),           PIPE_CAP_INDEP_BLEND_FUNC                 },
       { o(ARB_draw_instanced),               PIPE_CAP_TGSI_INSTANCEID                  },
       { o(ARB_fragment_program_shadow),      PIPE_CAP_TEXTURE_SHADOW_MAP               },
+      { o(ARB_framebuffer_object),           PIPE_CAP_MIXED_FRAMEBUFFER_SIZES          },
       { o(ARB_instanced_arrays),             PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR  },
       { o(ARB_occlusion_query),              PIPE_CAP_OCCLUSION_QUERY                  },
       { o(ARB_occlusion_query2),             PIPE_CAP_OCCLUSION_QUERY                  },
@@ -432,6 +451,8 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_shader_stencil_export),        PIPE_CAP_SHADER_STENCIL_EXPORT            },
       { o(ARB_shader_texture_lod),           PIPE_CAP_SM3                              },
       { o(ARB_shadow),                       PIPE_CAP_TEXTURE_SHADOW_MAP               },
+      { o(ARB_texture_buffer_object),        PIPE_CAP_TEXTURE_BUFFER_OBJECTS           },
+      { o(ARB_texture_gather),               PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS    },
       { o(ARB_texture_mirror_clamp_to_edge), PIPE_CAP_TEXTURE_MIRROR_CLAMP             },
       { o(ARB_texture_non_power_of_two),     PIPE_CAP_NPOT_TEXTURES                    },
       { o(ARB_timer_query),                  PIPE_CAP_QUERY_TIMESTAMP                  },
@@ -452,11 +473,14 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ATI_separate_stencil),             PIPE_CAP_TWO_SIDED_STENCIL                },
       { o(ATI_texture_mirror_once),          PIPE_CAP_TEXTURE_MIRROR_CLAMP             },
       { o(NV_conditional_render),            PIPE_CAP_CONDITIONAL_RENDER               },
+      { o(NV_primitive_restart),             PIPE_CAP_PRIMITIVE_RESTART                },
       { o(NV_texture_barrier),               PIPE_CAP_TEXTURE_BARRIER                  },
       /* GL_NV_point_sprite is not supported by gallium because we don't
        * support the GL_POINT_SPRITE_R_MODE_NV option. */
 
       { o(OES_standard_derivatives),         PIPE_CAP_SM3                              },
+      { o(OES_texture_float_linear),         PIPE_CAP_TEXTURE_FLOAT_LINEAR             },
+      { o(OES_texture_half_float_linear),    PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR        },
       { o(ARB_texture_cube_map_array),       PIPE_CAP_CUBE_MAP_ARRAY                   },
       { o(ARB_texture_multisample),          PIPE_CAP_TEXTURE_MULTISAMPLE              },
       { o(ARB_texture_query_lod),            PIPE_CAP_TEXTURE_QUERY_LOD                },
@@ -467,6 +491,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_texture_view),                 PIPE_CAP_SAMPLER_VIEW_TARGET              },
       { o(ARB_clip_control),                 PIPE_CAP_CLIP_HALFZ                       },
       { o(EXT_polygon_offset_clamp),         PIPE_CAP_POLYGON_OFFSET_CLAMP             },
+      { o(EXT_depth_bounds_test),            PIPE_CAP_DEPTH_BOUNDS_TEST                },
    };
 
    /* Required: render target and sampler support */
@@ -475,6 +500,12 @@ void st_init_extensions(struct pipe_screen *screen,
         { PIPE_FORMAT_R32G32B32A32_FLOAT,
           PIPE_FORMAT_R16G16B16A16_FLOAT } },
 
+      { { o(OES_texture_float) },
+        { PIPE_FORMAT_R32G32B32A32_FLOAT } },
+
+      { { o(OES_texture_half_float) },
+        { PIPE_FORMAT_R16G16B16A16_FLOAT } },
+
       { { o(ARB_texture_rgb10_a2ui) },
         { PIPE_FORMAT_R10G10B10A2_UINT,
           PIPE_FORMAT_B10G10R10A2_UINT },
@@ -556,7 +587,8 @@ void st_init_extensions(struct pipe_screen *screen,
           PIPE_FORMAT_R8G8B8A8_UNORM },
         GL_TRUE }, /* at least one format must be supported */
 
-      { { o(ARB_stencil_texturing) },
+      { { o(ARB_stencil_texturing),
+          o(ARB_texture_stencil8) },
         { PIPE_FORMAT_X24S8_UINT,
           PIPE_FORMAT_S8X24_UINT },
         GL_TRUE }, /* at least one format must be supported */
@@ -650,9 +682,6 @@ void st_init_extensions(struct pipe_screen *screen,
                           ARRAY_SIZE(vertex_mapping), PIPE_BUFFER,
                           PIPE_BIND_VERTEX_BUFFER);
 
-   if (extensions->ARB_stencil_texturing)
-      extensions->ARB_texture_stencil8 = GL_TRUE;
-
    /* Figure out GLSL support. */
    glsl_feature_level = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
 
@@ -693,6 +722,7 @@ void st_init_extensions(struct pipe_screen *screen,
       extensions->OES_depth_texture_cube_map = GL_TRUE;
       extensions->ARB_shading_language_420pack = GL_TRUE;
       extensions->ARB_texture_query_levels = GL_TRUE;
+      extensions->ARB_shader_subroutine = GL_TRUE;
 
       if (!options->disable_shader_bit_encoding) {
          extensions->ARB_shader_bit_encoding = GL_TRUE;
@@ -723,20 +753,9 @@ void st_init_extensions(struct pipe_screen *screen,
       extensions->ANGLE_texture_compression_dxt = GL_FALSE;
    }
 
-   if (screen->get_shader_param(screen, PIPE_SHADER_GEOMETRY,
+   if (screen->get_shader_param(screen, PIPE_SHADER_TESS_CTRL,
                                 PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
-#if 0 /* XXX re-enable when GLSL compiler again supports geometry shaders */
-      extensions->ARB_geometry_shader4 = GL_TRUE;
-#endif
-   }
-
-   if (screen->get_param(screen, PIPE_CAP_PRIMITIVE_RESTART)) {
-      extensions->NV_primitive_restart = GL_TRUE;
-   }
-
-   /* ARB_color_buffer_float. */
-   if (screen->get_param(screen, PIPE_CAP_VERTEX_COLOR_UNCLAMPED)) {
-      extensions->ARB_color_buffer_float = GL_TRUE;
+      extensions->ARB_tessellation_shader = GL_TRUE;
    }
 
    if (screen->fence_finish) {
@@ -823,9 +842,7 @@ void st_init_extensions(struct pipe_screen *screen,
    consts->MinMapBufferAlignment =
       screen->get_param(screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
 
-   if (screen->get_param(screen, PIPE_CAP_TEXTURE_BUFFER_OBJECTS)) {
-      extensions->ARB_texture_buffer_object = GL_TRUE;
-
+   if (extensions->ARB_texture_buffer_object) {
       consts->MaxTextureBufferSize =
          _min(screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE),
               (1u << 31) - 1);
@@ -840,10 +857,6 @@ void st_init_extensions(struct pipe_screen *screen,
                              PIPE_BIND_SAMPLER_VIEW);
    }
 
-   if (screen->get_param(screen, PIPE_CAP_MIXED_FRAMEBUFFER_SIZES)) {
-      extensions->ARB_framebuffer_object = GL_TRUE;
-   }
-
    /* Unpacking a varying in the fragment shader costs 1 texture indirection.
     * If the number of available texture indirections is very limited, then we
     * prefer to disable varying packing rather than run the risk of varying
@@ -868,9 +881,6 @@ void st_init_extensions(struct pipe_screen *screen,
          extensions->AMD_vertex_shader_viewport_index = GL_TRUE;
    }
 
-   if (consts->MaxProgramTextureGatherComponents > 0)
-      extensions->ARB_texture_gather = GL_TRUE;
-
    /* GL_ARB_ES3_compatibility.
     *
     * Assume that ES3 is supported if GLSL 3.30 is supported.
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 25e30c7deb2..6f007273c73 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -158,9 +158,12 @@ public:
    {
       this->file = file;
       this->index = index;
+      this->index2D = 0;
       this->writemask = writemask;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->reladdr2 = NULL;
+      this->has_index2 = false;
       this->type = type;
       this->array_id = 0;
    }
@@ -169,9 +172,12 @@ public:
    {
       this->file = file;
       this->index = 0;
+      this->index2D = 0;
       this->writemask = writemask;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->reladdr2 = NULL;
+      this->has_index2 = false;
       this->type = type;
       this->array_id = 0;
    }
@@ -181,9 +187,12 @@ public:
       this->type = GLSL_TYPE_ERROR;
       this->file = PROGRAM_UNDEFINED;
       this->index = 0;
+      this->index2D = 0;
       this->writemask = 0;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->reladdr2 = NULL;
+      this->has_index2 = false;
       this->array_id = 0;
    }
 
@@ -191,11 +200,14 @@ public:
 
    gl_register_file file; /**< PROGRAM_* from Mesa */
    int index; /**< temporary index, VERT_ATTRIB_*, VARYING_SLOT_*, etc. */
+   int index2D;
    int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
    GLuint cond_mask:4;
    int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
    /** Register index should be offset by the integer in this reg. */
    st_src_reg *reladdr;
+   st_src_reg *reladdr2;
+   bool has_index2;
    unsigned array_id;
 };
 
@@ -207,9 +219,9 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->swizzle = SWIZZLE_XYZW;
    this->negate = 0;
    this->reladdr = reg.reladdr;
-   this->index2D = 0;
-   this->reladdr2 = NULL;
-   this->has_index2 = false;
+   this->index2D = reg.index2D;
+   this->reladdr2 = reg.reladdr2;
+   this->has_index2 = reg.has_index2;
    this->double_reg2 = false;
    this->array_id = reg.array_id;
 }
@@ -222,6 +234,9 @@ st_dst_reg::st_dst_reg(st_src_reg reg)
    this->writemask = WRITEMASK_XYZW;
    this->cond_mask = COND_TR;
    this->reladdr = reg.reladdr;
+   this->index2D = reg.index2D;
+   this->reladdr2 = reg.reladdr2;
+   this->has_index2 = reg.has_index2;
    this->array_id = reg.array_id;
 }
 
@@ -551,8 +566,8 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
     * reg directly for one of the regs, and preload the other reladdr
     * sources into temps.
     */
-   num_reladdr += dst.reladdr != NULL;
-   num_reladdr += dst1.reladdr != NULL;
+   num_reladdr += dst.reladdr != NULL || dst.reladdr2;
+   num_reladdr += dst1.reladdr != NULL || dst1.reladdr2;
    num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
    num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
    num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
@@ -563,8 +578,11 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
    reladdr_to_temp(ir, &src1, &num_reladdr);
    reladdr_to_temp(ir, &src0, &num_reladdr);
 
-   if (dst.reladdr) {
-      emit_arl(ir, address_reg, *dst.reladdr);
+   if (dst.reladdr || dst.reladdr2) {
+      if (dst.reladdr)
+         emit_arl(ir, address_reg, *dst.reladdr);
+      if (dst.reladdr2)
+         emit_arl(ir, address_reg2, *dst.reladdr2);
       num_reladdr--;
    }
    if (dst1.reladdr) {
@@ -590,7 +608,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
    inst->function = NULL;
 
    /* Update indirect addressing status used by TGSI */
-   if (dst.reladdr) {
+   if (dst.reladdr || dst.reladdr2) {
       switch(dst.file) {
       case PROGRAM_STATE_VAR:
       case PROGRAM_CONSTANT:
@@ -797,7 +815,7 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
    case TGSI_OPCODE_##c: \
       if (type == GLSL_TYPE_DOUBLE) \
          op = TGSI_OPCODE_##d; \
-      else if (type == GLSL_TYPE_INT)       \
+      else if (type == GLSL_TYPE_INT || type == GLSL_TYPE_SUBROUTINE)       \
          op = TGSI_OPCODE_##i; \
       else if (type == GLSL_TYPE_UINT) \
          op = TGSI_OPCODE_##u; \
@@ -1090,6 +1108,7 @@ type_size(const struct glsl_type *type)
       return size;
    case GLSL_TYPE_SAMPLER:
    case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_SUBROUTINE:
       /* Samplers take up one slot in UNIFORMS[], but they're baked in
        * at link time.
        */
@@ -1470,6 +1489,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
          result_src = op[0];
       }
       break;
+   case ir_unop_subroutine_to_int:
+      emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+      break;
    case ir_unop_abs:
       emit_asm(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
       break;
@@ -2243,7 +2265,10 @@ is_inout_array(unsigned stage, ir_variable *var, bool *is_2d)
 
    *is_2d = false;
 
-   if (stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) {
+   if (((stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) ||
+        (stage == MESA_SHADER_TESS_EVAL && var->data.mode == ir_var_shader_in) ||
+        stage == MESA_SHADER_TESS_CTRL) &&
+       !var->data.patch) {
       if (!var->type->is_array())
          return false; /* a system value probably */
 
@@ -2355,7 +2380,8 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 
 static void
 shrink_array_declarations(struct array_decl *arrays, unsigned count,
-                          GLbitfield64 usage_mask)
+                          GLbitfield64 usage_mask,
+                          GLbitfield patch_usage_mask)
 {
    unsigned i, j;
 
@@ -2367,8 +2393,15 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
 
       /* Shrink the beginning. */
       for (j = 0; j < decl->array_size; j++) {
-         if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
-            break;
+         if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
+            if (patch_usage_mask &
+                BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
+               break;
+         }
+         else {
+            if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+               break;
+         }
 
          decl->mesa_index++;
          decl->array_size--;
@@ -2377,8 +2410,15 @@ shrink_array_declarations(struct array_decl *arrays, unsigned count,
 
       /* Shrink the end. */
       for (j = decl->array_size-1; j >= 0; j--) {
-         if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
-            break;
+         if (decl->mesa_index >= VARYING_SLOT_PATCH0) {
+            if (patch_usage_mask &
+                BITFIELD64_BIT(decl->mesa_index - VARYING_SLOT_PATCH0 + j))
+               break;
+         }
+         else {
+            if (usage_mask & BITFIELD64_BIT(decl->mesa_index+j))
+               break;
+         }
 
          decl->array_size--;
       }
@@ -2391,22 +2431,34 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
    ir_constant *index;
    st_src_reg src;
    int element_size = type_size(ir->type);
-   bool is_2D_input;
+   bool is_2D = false;
 
    index = ir->array_index->constant_expression_value();
 
    ir->array->accept(this);
    src = this->result;
 
-   is_2D_input = this->prog->Target == GL_GEOMETRY_PROGRAM_NV &&
-                 src.file == PROGRAM_INPUT &&
-                 ir->array->ir_type != ir_type_dereference_array;
+   if (ir->array->ir_type != ir_type_dereference_array) {
+      switch (this->prog->Target) {
+      case GL_TESS_CONTROL_PROGRAM_NV:
+         is_2D = (src.file == PROGRAM_INPUT || src.file == PROGRAM_OUTPUT) &&
+                 !ir->variable_referenced()->data.patch;
+         break;
+      case GL_TESS_EVALUATION_PROGRAM_NV:
+         is_2D = src.file == PROGRAM_INPUT &&
+                 !ir->variable_referenced()->data.patch;
+         break;
+      case GL_GEOMETRY_PROGRAM_NV:
+         is_2D = src.file == PROGRAM_INPUT;
+         break;
+      }
+   }
 
-   if (is_2D_input)
+   if (is_2D)
       element_size = 1;
 
    if (index) {
-      if (is_2D_input) {
+      if (is_2D) {
          src.index2D = index->value.i[0];
          src.has_index2 = true;
       } else
@@ -2433,7 +2485,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
       /* If there was already a relative address register involved, add the
        * new and the old together to get the new offset.
        */
-      if (!is_2D_input && src.reladdr != NULL) {
+      if (!is_2D && src.reladdr != NULL) {
          st_src_reg accum_reg = get_temp(native_integers ?
                                 glsl_type::int_type : glsl_type::float_type);
 
@@ -2443,7 +2495,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
          index_reg = accum_reg;
       }
 
-      if (is_2D_input) {
+      if (is_2D) {
          src.reladdr2 = ralloc(mem_ctx, st_src_reg);
          memcpy(src.reladdr2, &index_reg, sizeof(index_reg));
          src.index2D = 0;
@@ -3430,7 +3482,10 @@ glsl_to_tgsi_visitor::visit(ir_end_primitive *ir)
 void
 glsl_to_tgsi_visitor::visit(ir_barrier *ir)
 {
-   unreachable("Not implemented!");
+   assert(this->prog->Target == GL_TESS_CONTROL_PROGRAM_NV ||
+          this->prog->Target == GL_COMPUTE_PROGRAM_NV);
+
+   emit_asm(ir, TGSI_OPCODE_BARRIER);
 }
 
 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
@@ -3553,7 +3608,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
 {
    int tempWritesSize = 0;
    unsigned *tempWrites = NULL;
-   unsigned outputWrites[MAX_PROGRAM_OUTPUTS];
+   unsigned outputWrites[VARYING_SLOT_TESS_MAX];
 
    memset(outputWrites, 0, sizeof(outputWrites));
 
@@ -3561,8 +3616,8 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
       unsigned prevWriteMask = 0;
 
       /* Give up if we encounter relative addressing or flow control. */
-      if (inst->dst[0].reladdr ||
-          inst->dst[1].reladdr ||
+      if (inst->dst[0].reladdr || inst->dst[0].reladdr2 ||
+          inst->dst[1].reladdr || inst->dst[1].reladdr2 ||
           tgsi_get_opcode_info(inst->op)->is_branch ||
           inst->op == TGSI_OPCODE_BGNSUB ||
           inst->op == TGSI_OPCODE_CONT ||
@@ -3573,7 +3628,7 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
       }
 
       if (inst->dst[0].file == PROGRAM_OUTPUT) {
-         assert(inst->dst[0].index < MAX_PROGRAM_OUTPUTS);
+         assert(inst->dst[0].index < (signed)ARRAY_SIZE(outputWrites));
          prevWriteMask = outputWrites[inst->dst[0].index];
          outputWrites[inst->dst[0].index] |= inst->dst[0].writemask;
       } else if (inst->dst[0].file == PROGRAM_TEMPORARY) {
@@ -3940,6 +3995,7 @@ glsl_to_tgsi_visitor::copy_propagate(void)
           !(inst->dst[0].file == inst->src[0].file &&
              inst->dst[0].index == inst->src[0].index) &&
           !inst->dst[0].reladdr &&
+          !inst->dst[0].reladdr2 &&
           !inst->saturate &&
           inst->src[0].file != PROGRAM_ARRAY &&
           !inst->src[0].reladdr &&
@@ -4527,6 +4583,14 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
    TGSI_SEMANTIC_SAMPLEID,
    TGSI_SEMANTIC_SAMPLEPOS,
    TGSI_SEMANTIC_SAMPLEMASK,
+
+   /* Tessellation shaders
+    */
+   TGSI_SEMANTIC_TESSCOORD,
+   TGSI_SEMANTIC_VERTICESIN,
+   TGSI_SEMANTIC_PRIMID,
+   TGSI_SEMANTIC_TESSOUTER,
+   TGSI_SEMANTIC_TESSINNER,
 };
 
 /**
@@ -4651,6 +4715,9 @@ dst_register(struct st_translate *t, gl_register_file file, unsigned index,
       if (!array_id) {
          if (t->procType == TGSI_PROCESSOR_FRAGMENT)
             assert(index < FRAG_RESULT_MAX);
+         else if (t->procType == TGSI_PROCESSOR_TESS_CTRL ||
+                  t->procType == TGSI_PROCESSOR_TESS_EVAL)
+            assert(index < VARYING_SLOT_TESS_MAX);
          else
             assert(index < VARYING_SLOT_MAX);
 
@@ -4790,6 +4857,14 @@ translate_dst(struct st_translate *t,
       dst = ureg_dst_indirect(dst, ureg_src(t->address[0]));
    }
 
+   if (dst_reg->has_index2) {
+      if (dst_reg->reladdr2)
+         dst = ureg_dst_dimension_indirect(dst, ureg_src(t->address[1]),
+                                           dst_reg->index2D);
+      else
+         dst = ureg_dst_dimension(dst, dst_reg->index2D);
+   }
+
    return dst;
 }
 
@@ -5271,6 +5346,8 @@ st_translate_program(
           TGSI_SEMANTIC_VERTEXID_NOBASE);
    assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_BASE_VERTEX] ==
           TGSI_SEMANTIC_BASEVERTEX);
+   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] ==
+          TGSI_SEMANTIC_TESSCOORD);
 
    t = CALLOC_STRUCT(st_translate);
    if (!t) {
@@ -5313,6 +5390,8 @@ st_translate_program(
       }
       break;
    case TGSI_PROCESSOR_GEOMETRY:
+   case TGSI_PROCESSOR_TESS_EVAL:
+   case TGSI_PROCESSOR_TESS_CTRL:
       for (i = 0; i < numInputs; i++) {
          unsigned array_id = 0;
          unsigned array_size;
@@ -5347,6 +5426,8 @@ st_translate_program(
    case TGSI_PROCESSOR_FRAGMENT:
       break;
    case TGSI_PROCESSOR_GEOMETRY:
+   case TGSI_PROCESSOR_TESS_EVAL:
+   case TGSI_PROCESSOR_TESS_CTRL:
    case TGSI_PROCESSOR_VERTEX:
       for (i = 0; i < numOutputs; i++) {
          unsigned array_id = 0;
@@ -5461,6 +5542,7 @@ st_translate_program(
                struct pipe_screen *pscreen = st->pipe->screen;
                assert(procType == TGSI_PROCESSOR_VERTEX);
                assert(pscreen->get_shader_param(pscreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS));
+               (void) pscreen;
                if (!ctx->Const.NativeIntegers) {
                   struct ureg_dst temp = ureg_DECL_local_temporary(t->ureg);
                   ureg_U2F( t->ureg, ureg_writemask(temp, TGSI_WRITEMASK_X), t->systemValues[i]);
@@ -5611,25 +5693,6 @@ out:
 /* ----------------------------- End TGSI code ------------------------------ */
 
 
-static unsigned
-shader_stage_to_ptarget(gl_shader_stage stage)
-{
-   switch (stage) {
-   case MESA_SHADER_VERTEX:
-      return PIPE_SHADER_VERTEX;
-   case MESA_SHADER_FRAGMENT:
-      return PIPE_SHADER_FRAGMENT;
-   case MESA_SHADER_GEOMETRY:
-      return PIPE_SHADER_GEOMETRY;
-   case MESA_SHADER_COMPUTE:
-      return PIPE_SHADER_COMPUTE;
-   }
-
-   assert(!"should not be reached");
-   return PIPE_SHADER_VERTEX;
-}
-
-
 /**
  * Convert a shader's GLSL IR into a Mesa gl_program, although without
  * generating Mesa IR.
@@ -5646,7 +5709,7 @@ get_mesa_program(struct gl_context *ctx,
    struct gl_shader_compiler_options *options =
          &ctx->Const.ShaderCompilerOptions[_mesa_shader_enum_to_shader_stage(shader->Type)];
    struct pipe_screen *pscreen = ctx->st->pipe->screen;
-   unsigned ptarget = shader_stage_to_ptarget(shader->Stage);
+   unsigned ptarget = st_shader_stage_to_ptarget(shader->Stage);
 
    validate_ir_tree(shader->ir);
 
@@ -5673,7 +5736,7 @@ get_mesa_program(struct gl_context *ctx,
                                                prog->Parameters);
 
    /* Remove reads from output registers. */
-   lower_output_reads(shader->ir);
+   lower_output_reads(shader->Stage, shader->ir);
 
    /* Emit intermediate IR for main(). */
    visit_exec_list(shader->ir, v);
@@ -5721,7 +5784,11 @@ get_mesa_program(struct gl_context *ctx,
 
    /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. */
    v->simplify_cmp();
-   v->copy_propagate();
+
+   if (shader->Type != GL_TESS_CONTROL_SHADER &&
+       shader->Type != GL_TESS_EVALUATION_SHADER)
+      v->copy_propagate();
+
    while (v->eliminate_dead_code());
 
    v->merge_two_dsts();
@@ -5745,9 +5812,9 @@ get_mesa_program(struct gl_context *ctx,
 
    do_set_program_inouts(shader->ir, prog, shader->Stage);
    shrink_array_declarations(v->input_arrays, v->num_input_arrays,
-                             prog->InputsRead);
+                             prog->InputsRead, prog->PatchInputsRead);
    shrink_array_declarations(v->output_arrays, v->num_output_arrays,
-                             prog->OutputsWritten);
+                             prog->OutputsWritten, prog->PatchOutputsWritten);
    count_resources(v, prog);
 
    /* This must be done before the uniform storage is associated. */
@@ -5776,6 +5843,8 @@ get_mesa_program(struct gl_context *ctx,
    struct st_vertex_program *stvp;
    struct st_fragment_program *stfp;
    struct st_geometry_program *stgp;
+   struct st_tessctrl_program *sttcp;
+   struct st_tesseval_program *sttep;
 
    switch (shader->Type) {
    case GL_VERTEX_SHADER:
@@ -5790,6 +5859,14 @@ get_mesa_program(struct gl_context *ctx,
       stgp = (struct st_geometry_program *)prog;
       stgp->glsl_to_tgsi = v;
       break;
+   case GL_TESS_CONTROL_SHADER:
+      sttcp = (struct st_tessctrl_program *)prog;
+      sttcp->glsl_to_tgsi = v;
+      break;
+   case GL_TESS_EVALUATION_SHADER:
+      sttep = (struct st_tesseval_program *)prog;
+      sttep->glsl_to_tgsi = v;
+      break;
    default:
       assert(!"should not be reached");
       return NULL;
@@ -5800,6 +5877,71 @@ get_mesa_program(struct gl_context *ctx,
 
 extern "C" {
 
+static void
+st_dump_program_for_shader_db(struct gl_context *ctx,
+                              struct gl_shader_program *prog)
+{
+   /* Append the GLSL source of each shader attached to prog to the file
+    * named by ST_DUMP_SHADERS (for shader-db); skipped when prog->Name is 0.
+    *
+    * ST_DUMP_INSERT and ForceGLSLVersion allow some pre-processing of
+    * shaders while dumping, because some apps have ill-formed shaders.
+    */
+   const char *dump_filename = os_get_option("ST_DUMP_SHADERS");
+   const char *insert_directives = os_get_option("ST_DUMP_INSERT");
+
+   if (dump_filename && prog->Name != 0) {
+      FILE *f = fopen(dump_filename, "a");   /* "a": append to earlier dumps */
+
+      if (f) {
+         for (unsigned i = 0; i < prog->NumShaders; i++) {
+            const struct gl_shader *sh = prog->Shaders[i];
+            const char *source;
+            bool skip_version = false;
+
+            if (!sh)
+               continue;
+
+            source = sh->Source;
+
+            /* This string mustn't be changed. shader-db uses it to find
+             * where the shader begins.
+             */
+            fprintf(f, "GLSL %s shader %d source for linked program %d:\n",
+                    _mesa_shader_stage_to_string(sh->Stage),
+                    i, prog->Name);
+
+            /* Dump the forced version if set. */
+            if (ctx->Const.ForceGLSLVersion) {
+               fprintf(f, "#version %i\n", ctx->Const.ForceGLSLVersion);
+               skip_version = true;
+            }
+
+            /* Insert directives (optional), after a #version line if known. */
+            if (insert_directives) {
+               if (!ctx->Const.ForceGLSLVersion && prog->Version)
+                  fprintf(f, "#version %i\n", prog->Version);
+               fprintf(f, "%s\n", insert_directives);
+               skip_version = true;
+            }
+            /* With skip_version set, drop the source's leading #version line. */
+            if (skip_version && strncmp(source, "#version ", 9) == 0) {
+               const char *next_line = strstr(source, "\n");
+
+               if (next_line)
+                  source = next_line + 1;
+               else
+                  continue;   /* nothing follows the #version line */
+            }
+
+            fprintf(f, "%s", source);
+            fprintf(f, "\n");
+         }
+         fclose(f);
+      }
+   }
+}
+
 /**
  * Link a shader.
  * Called via ctx->Driver.LinkShader()
@@ -5821,7 +5963,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       gl_shader_stage stage = _mesa_shader_enum_to_shader_stage(prog->_LinkedShaders[i]->Type);
       const struct gl_shader_compiler_options *options =
             &ctx->Const.ShaderCompilerOptions[stage];
-      unsigned ptarget = shader_stage_to_ptarget(stage);
+      unsigned ptarget = st_shader_stage_to_ptarget(stage);
       bool have_dround = pscreen->get_shader_param(pscreen, ptarget,
                                                    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED);
       bool have_dfrexp = pscreen->get_shader_param(pscreen, ptarget,
@@ -5832,7 +5974,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
        */
       if (options->EmitNoIndirectInput || options->EmitNoIndirectOutput ||
           options->EmitNoIndirectTemp || options->EmitNoIndirectUniform) {
-         lower_variable_index_to_cond_assign(ir,
+         lower_variable_index_to_cond_assign(prog->_LinkedShaders[i]->Stage, ir,
                                              options->EmitNoIndirectInput,
                                              options->EmitNoIndirectOutput,
                                              options->EmitNoIndirectTemp,
@@ -5920,6 +6062,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       _mesa_reference_program(ctx, &linked_prog, NULL);
    }
 
+   st_dump_program_for_shader_db(ctx, prog);
    return GL_TRUE;
 }
 
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index a2dee6298fa..2e2c8ffaed9 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -368,6 +368,7 @@ st_visual_to_context_mode(const struct st_visual *visual,
 
       mode->rgbBits = mode->redBits +
          mode->greenBits + mode->blueBits + mode->alphaBits;
+      mode->sRGBCapable = util_format_is_srgb(visual->color_format);
    }
 
    if (visual->depth_stencil_format != PIPE_FORMAT_NONE) {
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index fa792bc349b..e62dd7aab80 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -163,6 +163,68 @@ st_release_gp_variants(struct st_context *st, struct st_geometry_program *stgp)
 }
 
 
+/**
+ * Destroy one tessellation control program variant: release its driver
+ * shader (if any) through the CSO context, then free the variant itself.
+ * The caller is responsible for unlinking it from the variant list first.
+ */
+static void
+delete_tcp_variant(struct st_context *st, struct st_tcp_variant *tcpv)
+{
+   if (tcpv->driver_shader != NULL)
+      cso_delete_tessctrl_shader(st->cso_context, tcpv->driver_shader);
+   free(tcpv);
+}
+
+
+/**
+ * Destroy every variant of a tessellation control program; empties its list.
+ */
+void
+st_release_tcp_variants(struct st_context *st, struct st_tessctrl_program *sttcp)
+{
+   struct st_tcp_variant *cur = sttcp->variants;
+
+   while (cur != NULL) {
+      /* Save the successor before the node is freed. */
+      struct st_tcp_variant *following = cur->next;
+      delete_tcp_variant(st, cur);
+      cur = following;
+   }
+   sttcp->variants = NULL;
+}
+
+
+/**
+ * Destroy one tessellation evaluation program variant: release its driver
+ * shader (if any) through the CSO context, then free the variant itself.
+ * The caller is responsible for unlinking it from the variant list first.
+ */
+static void
+delete_tep_variant(struct st_context *st, struct st_tep_variant *tepv)
+{
+   if (tepv->driver_shader != NULL)
+      cso_delete_tesseval_shader(st->cso_context, tepv->driver_shader);
+   free(tepv);
+}
+
+
+/**
+ * Destroy every variant of a tessellation evaluation program; empties its list.
+ */
+void
+st_release_tep_variants(struct st_context *st, struct st_tesseval_program *sttep)
+{
+   struct st_tep_variant *cur = sttep->variants;
+
+   while (cur != NULL) {
+      /* Save the successor before the node is freed. */
+      struct st_tep_variant *following = cur->next;
+      delete_tep_variant(st, cur);
+      cur = following;
+   }
+   sttep->variants = NULL;
+}
 
 
 /**
@@ -870,61 +932,52 @@ st_get_fp_variant(struct st_context *st,
 
 
 /**
- * Translate a geometry program to create a new variant.
+ * Translate a program. This is common code for geometry and tessellation
+ * shaders.
  */
-static struct st_gp_variant *
-st_translate_geometry_program(struct st_context *st,
-                              struct st_geometry_program *stgp,
-                              const struct st_gp_variant_key *key)
+static void
+st_translate_program_common(struct st_context *st,
+                            struct gl_program *prog,
+                            struct glsl_to_tgsi_visitor *glsl_to_tgsi,
+                            struct ureg_program *ureg,
+                            unsigned tgsi_processor,
+                            struct pipe_shader_state *out_state)
 {
-   GLuint inputSlotToAttr[VARYING_SLOT_MAX];
-   GLuint inputMapping[VARYING_SLOT_MAX];
-   GLuint outputSlotToAttr[VARYING_SLOT_MAX];
-   GLuint outputMapping[VARYING_SLOT_MAX];
-   struct pipe_context *pipe = st->pipe;
+   GLuint inputSlotToAttr[VARYING_SLOT_TESS_MAX];
+   GLuint inputMapping[VARYING_SLOT_TESS_MAX];
+   GLuint outputSlotToAttr[VARYING_SLOT_TESS_MAX];
+   GLuint outputMapping[VARYING_SLOT_TESS_MAX];
    GLuint attr;
 
-   uint gs_num_inputs = 0;
-
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS];
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+   uint num_inputs = 0;
 
-   ubyte gs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
-   ubyte gs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
-   uint gs_num_outputs = 0;
+   ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
+   ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
+   uint num_outputs = 0;
 
    GLint i;
-   struct ureg_program *ureg;
-   struct pipe_shader_state state = {0};
-   struct st_gp_variant *gpv;
-
-   gpv = CALLOC_STRUCT(st_gp_variant);
-   if (!gpv)
-      return NULL;
-
-   ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen);
-   if (ureg == NULL) {
-      free(gpv);
-      return NULL;
-   }
 
    memset(inputSlotToAttr, 0, sizeof(inputSlotToAttr));
    memset(inputMapping, 0, sizeof(inputMapping));
    memset(outputSlotToAttr, 0, sizeof(outputSlotToAttr));
    memset(outputMapping, 0, sizeof(outputMapping));
+   memset(out_state, 0, sizeof(*out_state));
 
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
     */
    for (attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-      if ((stgp->Base.Base.InputsRead & BITFIELD64_BIT(attr)) != 0) {
-         const GLuint slot = gs_num_inputs++;
+      if ((prog->InputsRead & BITFIELD64_BIT(attr)) != 0) {
+         const GLuint slot = num_inputs++;
 
          inputMapping[attr] = slot;
          inputSlotToAttr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_PRIMITIVE_ID:
+            assert(tgsi_processor == TGSI_PROCESSOR_GEOMETRY);
             input_semantic_name[slot] = TGSI_SEMANTIC_PRIMID;
             input_semantic_index[slot] = 0;
             break;
@@ -976,19 +1029,33 @@ st_translate_geometry_program(struct st_context *st,
             /* fall through */
          case VARYING_SLOT_VAR0:
          default:
-            assert(attr >= VARYING_SLOT_VAR0 && attr < VARYING_SLOT_MAX);
+            assert(attr >= VARYING_SLOT_VAR0 ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
             input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
             input_semantic_index[slot] =
                st_get_generic_varying_index(st, attr);
-         break;
+            break;
          }
       }
    }
 
+   /* Also add patch inputs (bit N selects VARYING_SLOT_PATCH0 + N). */
+   for (attr = 0; attr < 32; attr++) {
+      /* Unsigned literal: 1 << 31 on a signed int is undefined behavior. */
+      if (prog->PatchInputsRead & (1u << attr)) {
+         GLuint slot = num_inputs++;
+         GLuint patch_attr = VARYING_SLOT_PATCH0 + attr;
+         inputMapping[patch_attr] = slot;
+         inputSlotToAttr[slot] = patch_attr;
+         input_semantic_name[slot] = TGSI_SEMANTIC_PATCH;
+         input_semantic_index[slot] = attr;
+      }
+   }
+
    /* initialize output semantics to defaults */
    for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
-      gs_output_semantic_name[i] = TGSI_SEMANTIC_GENERIC;
-      gs_output_semantic_index[i] = 0;
+      output_semantic_name[i] = TGSI_SEMANTIC_GENERIC;
+      output_semantic_index[i] = 0;
    }
 
    /*
@@ -996,8 +1063,8 @@ st_translate_geometry_program(struct st_context *st,
     * mapping and the semantic information for each output.
     */
    for (attr = 0; attr < VARYING_SLOT_MAX; attr++) {
-      if (stgp->Base.Base.OutputsWritten & BITFIELD64_BIT(attr)) {
-         GLuint slot = gs_num_outputs++;
+      if (prog->OutputsWritten & BITFIELD64_BIT(attr)) {
+         GLuint slot = num_outputs++;
 
          outputMapping[attr] = slot;
          outputSlotToAttr[slot] = attr;
@@ -1005,56 +1072,64 @@ st_translate_geometry_program(struct st_context *st,
          switch (attr) {
          case VARYING_SLOT_POS:
             assert(slot == 0);
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_POSITION;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_COL0:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_COL1:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
-            gs_output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_COLOR;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_BFC0:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_BFC1:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
-            gs_output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_BCOLOR;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_FOGC:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_FOG;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_FOG;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_PSIZ:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_PSIZE;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_VERTEX:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPVERTEX;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_DIST0:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_CLIP_DIST1:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
-            gs_output_semantic_index[slot] = 1;
+            output_semantic_name[slot] = TGSI_SEMANTIC_CLIPDIST;
+            output_semantic_index[slot] = 1;
             break;
          case VARYING_SLOT_LAYER:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_LAYER;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_LAYER;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_PRIMITIVE_ID:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_PRIMID;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_PRIMID;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_VIEWPORT:
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX;
-            gs_output_semantic_index[slot] = 0;
+            output_semantic_name[slot] = TGSI_SEMANTIC_VIEWPORT_INDEX;
+            output_semantic_index[slot] = 0;
+            break;
+         case VARYING_SLOT_TESS_LEVEL_OUTER:
+            output_semantic_name[slot] = TGSI_SEMANTIC_TESSOUTER;
+            output_semantic_index[slot] = 0;
+            break;
+         case VARYING_SLOT_TESS_LEVEL_INNER:
+            output_semantic_name[slot] = TGSI_SEMANTIC_TESSINNER;
+            output_semantic_index[slot] = 0;
             break;
          case VARYING_SLOT_TEX0:
          case VARYING_SLOT_TEX1:
@@ -1065,36 +1140,44 @@ st_translate_geometry_program(struct st_context *st,
          case VARYING_SLOT_TEX6:
          case VARYING_SLOT_TEX7:
             if (st->needs_texcoord_semantic) {
-               gs_output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD;
-               gs_output_semantic_index[slot] = attr - VARYING_SLOT_TEX0;
+               output_semantic_name[slot] = TGSI_SEMANTIC_TEXCOORD;
+               output_semantic_index[slot] = attr - VARYING_SLOT_TEX0;
                break;
             }
             /* fall through */
          case VARYING_SLOT_VAR0:
          default:
-            assert(slot < ARRAY_SIZE(gs_output_semantic_name));
-            assert(attr >= VARYING_SLOT_VAR0);
-            gs_output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            gs_output_semantic_index[slot] =
+            assert(slot < ARRAY_SIZE(output_semantic_name));
+            assert(attr >= VARYING_SLOT_VAR0 ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
+            output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
+            output_semantic_index[slot] =
                st_get_generic_varying_index(st, attr);
-         break;
+            break;
          }
       }
    }
 
-   ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType);
-   ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType);
-   ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES,
-                 stgp->Base.VerticesOut);
-   ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations);
+   /* Also add patch outputs (bit N selects VARYING_SLOT_PATCH0 + N). */
+   for (attr = 0; attr < 32; attr++) {
+      /* Unsigned literal: 1 << 31 on a signed int is undefined behavior. */
+      if (prog->PatchOutputsWritten & (1u << attr)) {
+         GLuint slot = num_outputs++;
+         GLuint patch_attr = VARYING_SLOT_PATCH0 + attr;
+         outputMapping[patch_attr] = slot;
+         outputSlotToAttr[slot] = patch_attr;
+         output_semantic_name[slot] = TGSI_SEMANTIC_PATCH;
+         output_semantic_index[slot] = attr;
+      }
+   }
 
    st_translate_program(st->ctx,
-                        TGSI_PROCESSOR_GEOMETRY,
+                        tgsi_processor,
                         ureg,
-                        stgp->glsl_to_tgsi,
-                        &stgp->Base.Base,
+                        glsl_to_tgsi,
+                        prog,
                         /* inputs */
-                        gs_num_inputs,
+                        num_inputs,
                         inputMapping,
                         inputSlotToAttr,
                         input_semantic_name,
@@ -1102,30 +1185,64 @@ st_translate_geometry_program(struct st_context *st,
                         NULL,
                         NULL,
                         /* outputs */
-                        gs_num_outputs,
+                        num_outputs,
                         outputMapping,
                         outputSlotToAttr,
-                        gs_output_semantic_name,
-                        gs_output_semantic_index,
+                        output_semantic_name,
+                        output_semantic_index,
                         FALSE,
                         FALSE);
 
-   state.tokens = ureg_get_tokens(ureg, NULL);
+   out_state->tokens = ureg_get_tokens(ureg, NULL);
    ureg_destroy(ureg);
 
-   st_translate_stream_output_info(stgp->glsl_to_tgsi,
+   st_translate_stream_output_info(glsl_to_tgsi,
                                    outputMapping,
-                                   &state.stream_output);
+                                   &out_state->stream_output);
 
    if ((ST_DEBUG & DEBUG_TGSI) && (ST_DEBUG & DEBUG_MESA)) {
-      _mesa_print_program(&stgp->Base.Base);
+      _mesa_print_program(prog);
       debug_printf("\n");
    }
 
    if (ST_DEBUG & DEBUG_TGSI) {
-      tgsi_dump(state.tokens, 0);
+      tgsi_dump(out_state->tokens, 0);
       debug_printf("\n");
    }
+}
+
+
+/**
+ * Translate a geometry program to create a new variant.
+ */
+static struct st_gp_variant *
+st_translate_geometry_program(struct st_context *st,
+                              struct st_geometry_program *stgp,
+                              const struct st_gp_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct ureg_program *ureg;
+   struct st_gp_variant *gpv;
+   struct pipe_shader_state state;
+
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen);
+   if (ureg == NULL)
+      return NULL;
+
+   ureg_property(ureg, TGSI_PROPERTY_GS_INPUT_PRIM, stgp->Base.InputType);
+   ureg_property(ureg, TGSI_PROPERTY_GS_OUTPUT_PRIM, stgp->Base.OutputType);
+   ureg_property(ureg, TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES,
+                 stgp->Base.VerticesOut);
+   ureg_property(ureg, TGSI_PROPERTY_GS_INVOCATIONS, stgp->Base.Invocations);
+
+   st_translate_program_common(st, &stgp->Base.Base, stgp->glsl_to_tgsi, ureg,
+                               TGSI_PROCESSOR_GEOMETRY, &state);
+
+   gpv = CALLOC_STRUCT(st_gp_variant);
+   if (!gpv) {
+      ureg_free_tokens(state.tokens);
+      return NULL;
+   }
 
    /* fill in new variant */
    gpv->driver_shader = pipe->create_gs_state(pipe, &state);
@@ -1167,6 +1284,168 @@ st_get_gp_variant(struct st_context *st,
 }
 
 
+/**
+ * Build a new tessellation control program variant; NULL on failure.
+ */
+static struct st_tcp_variant *
+st_translate_tessctrl_program(struct st_context *st,
+                              struct st_tessctrl_program *sttcp,
+                              const struct st_tcp_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct ureg_program *ureg;
+   struct st_tcp_variant *tcpv;
+   struct pipe_shader_state state; /* zeroed and filled in by the common helper */
+
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_CTRL, pipe->screen);
+   if (ureg == NULL) {
+      return NULL;
+   }
+
+   ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT,
+                 sttcp->Base.VerticesOut);
+   /* Stores the TGSI tokens in state and destroys ureg before returning. */
+   st_translate_program_common(st, &sttcp->Base.Base, sttcp->glsl_to_tgsi,
+                               ureg, TGSI_PROCESSOR_TESS_CTRL, &state);
+
+   tcpv = CALLOC_STRUCT(st_tcp_variant);
+   if (!tcpv) {
+      ureg_free_tokens(state.tokens);
+      return NULL;
+   }
+
+   /* fill in new variant; st_get_tcp_variant() links it into the list */
+   tcpv->driver_shader = pipe->create_tcs_state(pipe, &state);
+   tcpv->key = *key;
+   /* NOTE(review): presumably the driver keeps its own copy of the tokens. */
+   ureg_free_tokens(state.tokens);
+   return tcpv;
+}
+
+
+/**
+ * Look up the variant of a tessellation control program that matches key.
+ * On a miss a new variant is translated and added to the program's list.
+ * Returns NULL if the new variant could not be created.
+ */
+struct st_tcp_variant *
+st_get_tcp_variant(struct st_context *st,
+                  struct st_tessctrl_program *sttcp,
+                  const struct st_tcp_variant_key *key)
+{
+   struct st_tcp_variant *variant;
+
+   /* Keys are compared bytewise, so a hit requires an identical key. */
+   for (variant = sttcp->variants; variant != NULL; variant = variant->next) {
+      if (memcmp(&variant->key, key, sizeof(*key)) == 0)
+         return variant;
+   }
+
+   /* Cache miss: translate a fresh variant for this key. */
+   variant = st_translate_tessctrl_program(st, sttcp, key);
+   if (variant == NULL)
+      return NULL;
+
+   /* Push the new variant onto the head of the program's list. */
+   variant->next = sttcp->variants;
+   sttcp->variants = variant;
+
+   return variant;
+}
+
+
+/**
+ * Build a new tessellation evaluation program variant; NULL on failure.
+ */
+static struct st_tep_variant *
+st_translate_tesseval_program(struct st_context *st,
+                              struct st_tesseval_program *sttep,
+                              const struct st_tep_variant_key *key)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct ureg_program *ureg;
+   struct st_tep_variant *tepv;
+   struct pipe_shader_state state; /* zeroed and filled in by the common helper */
+
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_TESS_EVAL, pipe->screen);
+   if (ureg == NULL) {
+      return NULL;
+   }
+   /* GL_ISOLINES is passed to TGSI as GL_LINES; other modes pass through. */
+   if (sttep->Base.PrimitiveMode == GL_ISOLINES)
+      ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, GL_LINES);
+   else
+      ureg_property(ureg, TGSI_PROPERTY_TES_PRIM_MODE, sttep->Base.PrimitiveMode);
+   /* Translate the GL spacing enum into its PIPE_TESS_SPACING_* value. */
+   switch (sttep->Base.Spacing) {
+   case GL_EQUAL:
+      ureg_property(ureg, TGSI_PROPERTY_TES_SPACING, PIPE_TESS_SPACING_EQUAL);
+      break;
+   case GL_FRACTIONAL_EVEN:
+      ureg_property(ureg, TGSI_PROPERTY_TES_SPACING,
+                    PIPE_TESS_SPACING_FRACTIONAL_EVEN);
+      break;
+   case GL_FRACTIONAL_ODD:
+      ureg_property(ureg, TGSI_PROPERTY_TES_SPACING,
+                    PIPE_TESS_SPACING_FRACTIONAL_ODD);
+      break;
+   default:
+      assert(0);   /* unexpected spacing enum; no spacing property is set */
+   }
+
+   ureg_property(ureg, TGSI_PROPERTY_TES_VERTEX_ORDER_CW,
+                 sttep->Base.VertexOrder == GL_CW);
+   ureg_property(ureg, TGSI_PROPERTY_TES_POINT_MODE, sttep->Base.PointMode);
+   /* Stores the TGSI tokens in state and destroys ureg before returning. */
+   st_translate_program_common(st, &sttep->Base.Base, sttep->glsl_to_tgsi,
+                               ureg, TGSI_PROCESSOR_TESS_EVAL, &state);
+
+   tepv = CALLOC_STRUCT(st_tep_variant);
+   if (!tepv) {
+      ureg_free_tokens(state.tokens);
+      return NULL;
+   }
+
+   /* fill in new variant; st_get_tep_variant() links it into the list */
+   tepv->driver_shader = pipe->create_tes_state(pipe, &state);
+   tepv->key = *key;
+   /* NOTE(review): presumably the driver keeps its own copy of the tokens. */
+   ureg_free_tokens(state.tokens);
+   return tepv;
+}
+
+
+/**
+ * Look up the variant of a tessellation evaluation program that matches key.
+ * On a miss a new variant is translated and added to the program's list.
+ * Returns NULL if the new variant could not be created.
+ */
+struct st_tep_variant *
+st_get_tep_variant(struct st_context *st,
+                  struct st_tesseval_program *sttep,
+                  const struct st_tep_variant_key *key)
+{
+   struct st_tep_variant *variant;
+
+   /* Keys are compared bytewise, so a hit requires an identical key. */
+   for (variant = sttep->variants; variant != NULL; variant = variant->next) {
+      if (memcmp(&variant->key, key, sizeof(*key)) == 0)
+         return variant;
+   }
+
+   /* Cache miss: translate a fresh variant for this key. */
+   variant = st_translate_tesseval_program(st, sttep, key);
+   if (variant == NULL)
+      return NULL;
+
+   /* Push the new variant onto the head of the program's list. */
+   variant->next = sttep->variants;
+   sttep->variants = variant;
+
+   return variant;
+}
+
+
 /**
  * Vert/Geom/Frag programs have per-context variants.  Free all the
  * variants attached to the given program which match the given context.
@@ -1240,6 +1519,48 @@ destroy_program_variants(struct st_context *st, struct gl_program *program)
          }
       }
       break;
+   case GL_TESS_CONTROL_PROGRAM_NV:
+      {
+         struct st_tessctrl_program *sttcp =
+            (struct st_tessctrl_program *) program;
+         struct st_tcp_variant *tcpv, **prevPtr = &sttcp->variants;
+
+         for (tcpv = sttcp->variants; tcpv; ) {
+            struct st_tcp_variant *next = tcpv->next;
+            if (tcpv->key.st == st) {
+               /* unlink from list */
+               *prevPtr = next;
+               /* destroy this variant */
+               delete_tcp_variant(st, tcpv);
+            }
+            else {
+               prevPtr = &tcpv->next;
+            }
+            tcpv = next;
+         }
+      }
+      break;
+   case GL_TESS_EVALUATION_PROGRAM_NV:
+      {
+         struct st_tesseval_program *sttep =
+            (struct st_tesseval_program *) program;
+         struct st_tep_variant *tepv, **prevPtr = &sttep->variants;
+
+         for (tepv = sttep->variants; tepv; ) {
+            struct st_tep_variant *next = tepv->next;
+            if (tepv->key.st == st) {
+               /* unlink from list */
+               *prevPtr = next;
+               /* destroy this variant */
+               delete_tep_variant(st, tepv);
+            }
+            else {
+               prevPtr = &tepv->next;
+            }
+            tepv = next;
+         }
+      }
+      break;
    default:
       _mesa_problem(NULL, "Unexpected program target 0x%x in "
                     "destroy_program_variants_cb()", program->Target);
@@ -1276,6 +1597,8 @@ destroy_shader_program_variants_cb(GLuint key, void *data, void *userData)
    case GL_VERTEX_SHADER:
    case GL_FRAGMENT_SHADER:
    case GL_GEOMETRY_SHADER:
+   case GL_TESS_CONTROL_SHADER:
+   case GL_TESS_EVALUATION_SHADER:
       {
          destroy_program_variants(st, shader->Program);
       }
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index bb77eb6ed65..7013993fe38 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -188,7 +188,7 @@ struct st_gp_variant_key
  */
 struct st_gp_variant
 {
-   /* Parameters which generated this translated version of a vertex */
+   /* Parameters which generated this variant. */
    struct st_gp_variant_key key;
 
    void *driver_shader;
@@ -210,6 +210,76 @@ struct st_geometry_program
 
 
 
+/** Key identifying one variant of a tessellation control program. */
+struct st_tcp_variant_key
+{
+   struct st_context *st;          /**< owning context; variants are per-context */
+   /* no other fields yet */
+};
+
+
+/**
+ * One node in a tessellation control program's list of variants.
+ */
+struct st_tcp_variant
+{
+   /* Parameters which generated this variant. */
+   struct st_tcp_variant_key key;
+
+   void *driver_shader;           /**< opaque driver shader object */
+
+   struct st_tcp_variant *next;   /**< next variant in the list, or NULL */
+};
+
+
+/**
+ * Derived from Mesa gl_tess_ctrl_program; Base must stay the first member.
+ */
+struct st_tessctrl_program
+{
+   struct gl_tess_ctrl_program Base;  /**< The Mesa tess ctrl program */
+   struct glsl_to_tgsi_visitor* glsl_to_tgsi;
+
+   struct st_tcp_variant *variants;   /**< head of the variant list */
+};
+
+
+
+/** Key identifying one variant of a tessellation evaluation program. */
+struct st_tep_variant_key
+{
+   struct st_context *st;          /**< owning context; variants are per-context */
+   /* no other fields yet; keys are compared with memcmp() */
+};
+
+
+/**
+ * One node in a tessellation evaluation program's list of variants.
+ */
+struct st_tep_variant
+{
+   /* Parameters which generated this variant. */
+   struct st_tep_variant_key key;
+
+   void *driver_shader;           /**< from pipe->create_tes_state() */
+
+   struct st_tep_variant *next;   /**< next variant in the list, or NULL */
+};
+
+
+/**
+ * Derived from Mesa gl_tess_eval_program; Base must stay the first member.
+ */
+struct st_tesseval_program
+{
+   struct gl_tess_eval_program Base;  /**< The Mesa tess eval program */
+   struct glsl_to_tgsi_visitor* glsl_to_tgsi;  /**< input to TGSI translation */
+
+   struct st_tep_variant *variants;   /**< head of the variant list */
+};
+
+
+
 static inline struct st_fragment_program *
 st_fragment_program( struct gl_fragment_program *fp )
 {
@@ -229,6 +299,18 @@ st_geometry_program( struct gl_geometry_program *gp )
    return (struct st_geometry_program *)gp;
 }
 
+static inline struct st_tessctrl_program *
+st_tessctrl_program( struct gl_tess_ctrl_program *tcp )
+{
+   return (struct st_tessctrl_program *)tcp;  /* Base is the first member */
+}
+
+static inline struct st_tesseval_program *
+st_tesseval_program( struct gl_tess_eval_program *tep )
+{
+   return (struct st_tesseval_program *)tep;  /* Base is the first member */
+}
+
 static inline void
 st_reference_vertprog(struct st_context *st,
                       struct st_vertex_program **ptr,
@@ -259,6 +341,26 @@ st_reference_fragprog(struct st_context *st,
                            (struct gl_program *) prog);
 }
 
+static inline void
+st_reference_tesscprog(struct st_context *st,
+                       struct st_tessctrl_program **ptr,
+                       struct st_tessctrl_program *prog)
+{
+   /* Delegate to core Mesa, viewing both pointers as generic programs. */
+   struct gl_program **slot = (struct gl_program **) ptr;
+   _mesa_reference_program(st->ctx, slot, (struct gl_program *) prog);
+}
+
+static inline void
+st_reference_tesseprog(struct st_context *st,
+                       struct st_tesseval_program **ptr,
+                       struct st_tesseval_program *prog)
+{
+   /* Delegate to core Mesa, viewing both pointers as generic programs. */
+   struct gl_program **slot = (struct gl_program **) ptr;
+   _mesa_reference_program(st->ctx, slot, (struct gl_program *) prog);
+}
+
 /**
  * This defines mapping from Mesa VARYING_SLOTs to TGSI GENERIC slots.
  */
@@ -302,6 +404,16 @@ st_get_gp_variant(struct st_context *st,
                   struct st_geometry_program *stgp,
                   const struct st_gp_variant_key *key);
 
+extern struct st_tcp_variant *
+st_get_tcp_variant(struct st_context *st,
+                   struct st_tessctrl_program *sttcp,
+                   const struct st_tcp_variant_key *key);
+
+extern struct st_tep_variant *
+st_get_tep_variant(struct st_context *st,
+                   struct st_tesseval_program *sttep,
+                   const struct st_tep_variant_key *key);
+
 
 extern void
 st_prepare_vertex_program(struct gl_context *ctx,
@@ -324,6 +436,14 @@ extern void
 st_release_gp_variants(struct st_context *st,
                        struct st_geometry_program *stgp);
 
+extern void
+st_release_tcp_variants(struct st_context *st,
+                        struct st_tessctrl_program *sttcp);
+
+extern void
+st_release_tep_variants(struct st_context *st,
+                        struct st_tesseval_program *sttep);
+
 extern void
 st_destroy_program_variants(struct st_context *st);
 
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 6beb21e3389..52b094330b9 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -462,6 +462,11 @@ st_texture_get_sampler_view(struct st_context *st,
    return free;
 }
 
+
+/**
+ * For the given texture object, release any sampler views which belong
+ * to the calling context.
+ */
 void
 st_texture_release_sampler_view(struct st_context *st,
                                 struct st_texture_object *stObj)
@@ -478,6 +483,11 @@ st_texture_release_sampler_view(struct st_context *st,
    }
 }
 
+
+/**
+ * Release all sampler views attached to the given texture object, regardless
+ * of the context.
+ */
 void
 st_texture_release_all_sampler_views(struct st_context *st,
                                      struct st_texture_object *stObj)
diff --git a/src/mesa/swrast/s_aaline.c b/src/mesa/swrast/s_aaline.c
index f3258e813a6..de5b42b9f6b 100644
--- a/src/mesa/swrast/s_aaline.c
+++ b/src/mesa/swrast/s_aaline.c
@@ -116,11 +116,11 @@ compute_plane(GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1,
    const GLfloat b = pz * py;
    const GLfloat c = px * px + py * py;
    const GLfloat d = -(a * x0 + b * y0 + c * z0);
-   if (a == 0.0 && b == 0.0 && c == 0.0 && d == 0.0) {
-      plane[0] = 0.0;
-      plane[1] = 0.0;
-      plane[2] = 1.0;
-      plane[3] = 0.0;
+   if (a == 0.0F && b == 0.0F && c == 0.0F && d == 0.0F) {
+      plane[0] = 0.0F;
+      plane[1] = 0.0F;
+      plane[2] = 1.0F;
+      plane[3] = 0.0F;
    }
    else {
       plane[0] = a;
@@ -135,9 +135,9 @@ compute_plane(GLfloat x0, GLfloat y0, GLfloat x1, GLfloat y1,
 static inline void
 constant_plane(GLfloat value, GLfloat plane[4])
 {
-   plane[0] = 0.0;
-   plane[1] = 0.0;
-   plane[2] = -1.0;
+   plane[0] = 0.0F;
+   plane[1] = 0.0F;
+   plane[2] = -1.0F;
    plane[3] = value;
 }
 
@@ -160,8 +160,8 @@ static inline GLfloat
 solve_plane_recip(GLfloat x, GLfloat y, const GLfloat plane[4])
 {
    const GLfloat denom = plane[3] + plane[0] * x + plane[1] * y;
-   if (denom == 0.0)
-      return 0.0;
+   if (denom == 0.0F)
+      return 0.0F;
    else
       return -plane[2] / denom;
 }
@@ -374,7 +374,7 @@ segment(struct gl_context *ctx,
       if (x0 < x1) {
          xLeft = x0 - line->halfWidth;
          xRight = x1 + line->halfWidth;
-         if (line->dy >= 0.0) {
+         if (line->dy >= 0.0F) {
             yBot = y0 - 3.0F * line->halfWidth;
             yTop = y0 + line->halfWidth;
          }
@@ -386,7 +386,7 @@ segment(struct gl_context *ctx,
       else {
          xLeft = x1 - line->halfWidth;
          xRight = x0 + line->halfWidth;
-         if (line->dy <= 0.0) {
+         if (line->dy <= 0.0F) {
             yBot = y1 - 3.0F * line->halfWidth;
             yTop = y1 + line->halfWidth;
          }
@@ -420,7 +420,7 @@ segment(struct gl_context *ctx,
       if (y0 < y1) {
          yBot = y0 - line->halfWidth;
          yTop = y1 + line->halfWidth;
-         if (line->dx >= 0.0) {
+         if (line->dx >= 0.0F) {
             xLeft = x0 - 3.0F * line->halfWidth;
             xRight = x0 + line->halfWidth;
          }
@@ -432,7 +432,7 @@ segment(struct gl_context *ctx,
       else {
          yBot = y1 - line->halfWidth;
          yTop = y0 + line->halfWidth;
-         if (line->dx <= 0.0) {
+         if (line->dx <= 0.0F) {
             xLeft = x1 - 3.0F * line->halfWidth;
             xRight = x1 + line->halfWidth;
          }
diff --git a/src/mesa/swrast/s_aalinetemp.h b/src/mesa/swrast/s_aalinetemp.h
index f1d078fd89b..bebb131a5d1 100644
--- a/src/mesa/swrast/s_aalinetemp.h
+++ b/src/mesa/swrast/s_aalinetemp.h
@@ -44,7 +44,7 @@ NAME(plot)(struct gl_context *ctx, struct LineInfo *line, int ix, int iy)
 
    (void) swrast;
 
-   if (coverage == 0.0)
+   if (coverage == 0.0F)
       return;
 
    line->span.end++;
@@ -123,7 +123,7 @@ NAME(line)(struct gl_context *ctx, const SWvertex *v0, const SWvertex *v1)
                                  ctx->Const.MinLineWidthAA,
                                  ctx->Const.MaxLineWidthAA);
 
-   if (line.len == 0.0 || IS_INF_OR_NAN(line.len))
+   if (line.len == 0.0F || IS_INF_OR_NAN(line.len))
       return;
 
    INIT_SPAN(line.span, GL_LINE);
diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c
index 9e029db25ce..2974deed41b 100644
--- a/src/mesa/swrast/s_atifragshader.c
+++ b/src/mesa/swrast/s_atifragshader.c
@@ -436,13 +436,13 @@ execute_shader(struct gl_context *ctx, const struct ati_fragment_shader *shader,
 		     for (i = 0; i < 3; i++) {
 			dst[optype][i] =
 			   (src[optype][2][i] >
-			    0.5) ? src[optype][0][i] : src[optype][1][i];
+			    0.5F) ? src[optype][0][i] : src[optype][1][i];
 		     }
 		  }
 		  else {
 		     dst[optype][3] =
 			(src[optype][2][3] >
-			 0.5) ? src[optype][0][3] : src[optype][1][3];
+			 0.5F) ? src[optype][0][3] : src[optype][1][3];
 		  }
 		  break;
 
diff --git a/src/mesa/swrast/s_copypix.c b/src/mesa/swrast/s_copypix.c
index 68c83e44e12..0dbccc0f61d 100644
--- a/src/mesa/swrast/s_copypix.c
+++ b/src/mesa/swrast/s_copypix.c
@@ -27,6 +27,7 @@
 #include "main/context.h"
 #include "main/condrender.h"
 #include "main/macros.h"
+#include "main/blit.h"
 #include "main/pixeltransfer.h"
 #include "main/imports.h"
 
@@ -51,20 +52,9 @@ regions_overlap(GLint srcx, GLint srcy,
                 GLint width, GLint height,
                 GLfloat zoomX, GLfloat zoomY)
 {
-   if (zoomX == 1.0 && zoomY == 1.0) {
-      /* no zoom */
-      if (srcx >= dstx + width || (srcx + width <= dstx)) {
-         return GL_FALSE;
-      }
-      else if (srcy < dsty) { /* this is OK */
-         return GL_FALSE;
-      }
-      else if (srcy > dsty + height) {
-         return GL_FALSE;
-      }
-      else {
-         return GL_TRUE;
-      }
+   if (zoomX == 1.0F && zoomY == 1.0F) {
+      return _mesa_regions_overlap(srcx, srcy, srcx + width, srcy + height,
+                                   dstx, dsty, dstx + width, dsty + height);
    }
    else {
       /* add one pixel of slop when zooming, just to be safe */
@@ -211,8 +201,8 @@ scale_and_bias_z(struct gl_context *ctx, GLuint width,
    GLuint i;
 
    if (depthMax <= 0xffffff &&
-       ctx->Pixel.DepthScale == 1.0 &&
-       ctx->Pixel.DepthBias == 0.0) {
+       ctx->Pixel.DepthScale == 1.0F &&
+       ctx->Pixel.DepthBias == 0.0F) {
       /* no scale or bias and no clamping and no worry of overflow */
       const GLfloat depthMaxF = ctx->DrawBuffer->_DepthMaxF;
       for (i = 0; i < width; i++) {
diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c
index 134f897c039..ffadc05a732 100644
--- a/src/mesa/swrast/s_depth.c
+++ b/src/mesa/swrast/s_depth.c
@@ -419,8 +419,8 @@ _swrast_depth_bounds_test( struct gl_context *ctx, SWspan *span )
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    struct gl_renderbuffer *rb = fb->Attachment[BUFFER_DEPTH].Renderbuffer;
    GLubyte *zStart;
-   GLuint zMin = (GLuint) (ctx->Depth.BoundsMin * fb->_DepthMaxF + 0.5F);
-   GLuint zMax = (GLuint) (ctx->Depth.BoundsMax * fb->_DepthMaxF + 0.5F);
+   GLuint zMin = (GLuint)((double)ctx->Depth.BoundsMin * 0xffffffff);
+   GLuint zMax = (GLuint)((double)ctx->Depth.BoundsMax * 0xffffffff);
    GLubyte *mask = span->array->mask;
    const GLuint count = span->end;
    GLuint i;
@@ -444,6 +444,16 @@ _swrast_depth_bounds_test( struct gl_context *ctx, SWspan *span )
       zBufferVals = (const GLuint *) zStart;
    }
    else {
+      /* Round the bounds to the precision of the zbuffer. */
+      if (rb->Format == MESA_FORMAT_Z_UNORM16) {
+         zMin = (zMin & 0xffff0000) | (zMin >> 16);
+         zMax = (zMax & 0xffff0000) | (zMax >> 16);
+      } else {
+         /* 24 bits */
+         zMin = (zMin & 0xffffff00) | (zMin >> 24);
+         zMax = (zMax & 0xffffff00) | (zMax >> 24);
+      }
+
       /* unpack Z values into a temporary array */
       if (span->arrayMask & SPAN_XY) {
          get_z32_values(ctx, rb, count, span->array->x, span->array->y,
diff --git a/src/mesa/swrast/s_drawpix.c b/src/mesa/swrast/s_drawpix.c
index fb677ee1b16..dc6827ede9f 100644
--- a/src/mesa/swrast/s_drawpix.c
+++ b/src/mesa/swrast/s_drawpix.c
@@ -264,7 +264,7 @@ draw_stencil_pixels( struct gl_context *ctx, GLint x, GLint y,
                      const struct gl_pixelstore_attrib *unpack,
                      const GLvoid *pixels )
 {
-   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F;
    const GLenum destType = GL_UNSIGNED_BYTE;
    GLint row;
    GLubyte *values;
@@ -309,8 +309,8 @@ draw_depth_pixels( struct gl_context *ctx, GLint x, GLint y,
                    const GLvoid *pixels )
 {
    const GLboolean scaleOrBias
-      = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0;
-   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0;
+      = ctx->Pixel.DepthScale != 1.0f || ctx->Pixel.DepthBias != 0.0f;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0f || ctx->Pixel.ZoomY != 1.0f;
    SWspan span;
 
    INIT_SPAN(span, GL_BITMAP);
@@ -415,7 +415,7 @@ draw_rgba_pixels( struct gl_context *ctx, GLint x, GLint y,
                   const GLvoid *pixels )
 {
    const GLint imgX = x, imgY = y;
-   const GLboolean zoom = ctx->Pixel.ZoomX!=1.0 || ctx->Pixel.ZoomY!=1.0;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F;
    GLbitfield transferOps = ctx->_ImageTransferState;
    SWspan span;
 
@@ -601,10 +601,10 @@ draw_depth_stencil_pixels(struct gl_context *ctx, GLint x, GLint y,
 {
    const GLint imgX = x, imgY = y;
    const GLboolean scaleOrBias
-      = ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0;
+      = ctx->Pixel.DepthScale != 1.0F || ctx->Pixel.DepthBias != 0.0F;
    const GLuint stencilMask = ctx->Stencil.WriteMask[0];
    const GLenum stencilType = GL_UNSIGNED_BYTE;
-   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0 || ctx->Pixel.ZoomY != 1.0;
+   const GLboolean zoom = ctx->Pixel.ZoomX != 1.0F || ctx->Pixel.ZoomY != 1.0F;
    struct gl_renderbuffer *depthRb, *stencilRb;
    struct gl_pixelstore_attrib clippedUnpack = *unpack;
 
diff --git a/src/mesa/swrast/s_fragprog.c b/src/mesa/swrast/s_fragprog.c
index 175915a5a0b..4fbf66b9db7 100644
--- a/src/mesa/swrast/s_fragprog.c
+++ b/src/mesa/swrast/s_fragprog.c
@@ -243,9 +243,9 @@ run_program(struct gl_context *ctx, SWspan *span, GLuint start, GLuint end)
             /* Store result depth/z */
             if (outputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
                const GLfloat depth = machine->Outputs[FRAG_RESULT_DEPTH][2];
-               if (depth <= 0.0)
+               if (depth <= 0.0F)
                   span->array->z[i] = 0;
-               else if (depth >= 1.0)
+               else if (depth >= 1.0F)
                   span->array->z[i] = ctx->DrawBuffer->_DepthMax;
                else
                   span->array->z[i] =
diff --git a/src/mesa/swrast/s_lines.c b/src/mesa/swrast/s_lines.c
index 58bd2fc720a..ab8da7db289 100644
--- a/src/mesa/swrast/s_lines.c
+++ b/src/mesa/swrast/s_lines.c
@@ -241,7 +241,7 @@ _swrast_choose_line( struct gl_context *ctx )
          USE(general_line);
       }
       else if (ctx->Depth.Test
-               || ctx->Line.Width != 1.0
+               || ctx->Line.Width != 1.0F
                || ctx->Line.StippleFlag) {
          /* no texture, but Z, fog, width>1, stipple, etc. */
 #if CHAN_BITS == 32
@@ -252,7 +252,7 @@ _swrast_choose_line( struct gl_context *ctx )
       }
       else {
          assert(!ctx->Depth.Test);
-         assert(ctx->Line.Width == 1.0);
+         assert(ctx->Line.Width == 1.0F);
          /* simple lines */
          USE(simple_no_z_rgba_line);
       }
diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c
index 2212c95fa9a..d9aae73302c 100644
--- a/src/mesa/swrast/s_points.c
+++ b/src/mesa/swrast/s_points.c
@@ -208,9 +208,9 @@ sprite_point(struct gl_context *ctx, const SWvertex *vert)
       else {
          /* even size */
          /* 0.501 factor allows conformance to pass */
-         xmin = (GLint) (x + 0.501) - iRadius;
+         xmin = (GLint) (x + 0.501F) - iRadius;
          xmax = xmin + iSize - 1;
-         ymin = (GLint) (y + 0.501) - iRadius;
+         ymin = (GLint) (y + 0.501F) - iRadius;
          ymax = ymin + iSize - 1;
       }
 
@@ -423,9 +423,9 @@ large_point(struct gl_context *ctx, const SWvertex *vert)
       else {
          /* even size */
          /* 0.501 factor allows conformance to pass */
-         xmin = (GLint) (x + 0.501) - iRadius;
+         xmin = (GLint) (x + 0.501F) - iRadius;
          xmax = xmin + iSize - 1;
-         ymin = (GLint) (y + 0.501) - iRadius;
+         ymin = (GLint) (y + 0.501F) - iRadius;
          ymax = ymin + iSize - 1;
       }
 
@@ -552,7 +552,7 @@ _swrast_choose_point(struct gl_context *ctx)
       else if (ctx->Point.SmoothFlag) {
          swrast->Point = smooth_point;
       }
-      else if (size > 1.0 ||
+      else if (size > 1.0F ||
                ctx->Point._Attenuated ||
                ctx->VertexProgram.PointSizeEnabled) {
          swrast->Point = large_point;
diff --git a/src/mesa/swrast/s_span.c b/src/mesa/swrast/s_span.c
index 3db10e163d7..cd939ba9510 100644
--- a/src/mesa/swrast/s_span.c
+++ b/src/mesa/swrast/s_span.c
@@ -506,7 +506,7 @@ interpolate_texcoords(struct gl_context *ctx, SWspan *span)
             /* LOD is calculated directly in the ansiotropic filter, we can
              * skip the normal lambda function as the result is ignored.
              */
-            if (samp->MaxAnisotropy > 1.0 &&
+            if (samp->MaxAnisotropy > 1.0F &&
                 samp->MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
                needLambda = GL_FALSE;
             }
@@ -886,16 +886,16 @@ apply_aa_coverage(SWspan *span)
       GLubyte (*rgba)[4] = span->array->rgba8;
       for (i = 0; i < span->end; i++) {
          const GLfloat a = rgba[i][ACOMP] * coverage[i];
-         rgba[i][ACOMP] = (GLubyte) CLAMP(a, 0.0, 255.0);
-         assert(coverage[i] >= 0.0);
-         assert(coverage[i] <= 1.0);
+         rgba[i][ACOMP] = (GLubyte) CLAMP(a, 0.0F, 255.0F);
+         assert(coverage[i] >= 0.0F);
+         assert(coverage[i] <= 1.0F);
       }
    }
    else if (span->array->ChanType == GL_UNSIGNED_SHORT) {
       GLushort (*rgba)[4] = span->array->rgba16;
       for (i = 0; i < span->end; i++) {
          const GLfloat a = rgba[i][ACOMP] * coverage[i];
-         rgba[i][ACOMP] = (GLushort) CLAMP(a, 0.0, 65535.0);
+         rgba[i][ACOMP] = (GLushort) CLAMP(a, 0.0F, 65535.0F);
       }
    }
    else {
diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
index 453bd36367b..da4a013634c 100644
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -670,8 +670,8 @@ _swrast_texture_span( struct gl_context *ctx, SWspan *span )
                }
             }
 
-            if (samp->MinLod != -1000.0 ||
-                samp->MaxLod != 1000.0) {
+            if (samp->MinLod != -1000.0F ||
+                samp->MaxLod != 1000.0F) {
                /* apply LOD clamping to lambda */
                const GLfloat min = samp->MinLod;
                const GLfloat max = samp->MaxLod;
@@ -682,7 +682,7 @@ _swrast_texture_span( struct gl_context *ctx, SWspan *span )
                }
             }
          }
-         else if (samp->MaxAnisotropy > 1.0 &&
+         else if (samp->MaxAnisotropy > 1.0F &&
                   samp->MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
             /* sample_lambda_2d_aniso is beeing used as texture_sample_func,
              * it requires the current SWspan *span as an additional parameter.
diff --git a/src/mesa/swrast/s_texfilter.c b/src/mesa/swrast/s_texfilter.c
index abc1727cf29..314170fc751 100644
--- a/src/mesa/swrast/s_texfilter.c
+++ b/src/mesa/swrast/s_texfilter.c
@@ -1902,7 +1902,7 @@ sample_lambda_2d_aniso(struct gl_context *ctx,
    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
    const GLboolean adjustLOD =
       (texUnit->LodBias + samp->LodBias != 0.0F)
-      || (samp->MinLod != -1000.0 || samp->MaxLod != 1000.0);
+      || (samp->MinLod != -1000.0F || samp->MaxLod != 1000.0F);
 
    GLuint i;
    
@@ -1973,8 +1973,8 @@ sample_lambda_2d_aniso(struct gl_context *ctx,
                      ctx->Const.MaxTextureLodBias);
             lod += bias;
 
-            if (samp->MinLod != -1000.0 ||
-                samp->MaxLod != 1000.0) {
+            if (samp->MinLod != -1000.0F ||
+                samp->MaxLod != 1000.0F) {
                /* apply LOD clamping to lambda */
                lod = CLAMP(lod, samp->MinLod, samp->MaxLod);
             }
@@ -3713,7 +3713,7 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx,
                                     const struct gl_sampler_object *sampler)
 {
    if (!t || !_mesa_is_texture_complete(t, sampler)) {
-      return &null_sample_func;
+      return null_sample_func;
    }
    else {
       const GLboolean needLambda =
@@ -3722,32 +3722,32 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx,
       switch (t->Target) {
       case GL_TEXTURE_1D:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
          else if (needLambda) {
-            return &sample_lambda_1d;
+            return sample_lambda_1d;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_1d;
+            return sample_linear_1d;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_1d;
+            return sample_nearest_1d;
          }
       case GL_TEXTURE_2D:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
          else if (needLambda) {
             /* Anisotropic filtering extension. Activated only if mipmaps are used */
-            if (sampler->MaxAnisotropy > 1.0 &&
+            if (sampler->MaxAnisotropy > 1.0F &&
                 sampler->MinFilter == GL_LINEAR_MIPMAP_LINEAR) {
-               return &sample_lambda_2d_aniso;
+               return sample_lambda_2d_aniso;
             }
-            return &sample_lambda_2d;
+            return sample_lambda_2d;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_2d;
+            return sample_linear_2d;
          }
          else {
             /* check for a few optimized cases */
@@ -3772,72 +3772,72 @@ _swrast_choose_texture_sample_func( struct gl_context *ctx,
          }
       case GL_TEXTURE_3D:
          if (needLambda) {
-            return &sample_lambda_3d;
+            return sample_lambda_3d;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_3d;
+            return sample_linear_3d;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_3d;
+            return sample_nearest_3d;
          }
       case GL_TEXTURE_CUBE_MAP:
          if (needLambda) {
-            return &sample_lambda_cube;
+            return sample_lambda_cube;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_cube;
+            return sample_linear_cube;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_cube;
+            return sample_nearest_cube;
          }
       case GL_TEXTURE_RECTANGLE_NV:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
          else if (needLambda) {
-            return &sample_lambda_rect;
+            return sample_lambda_rect;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_rect;
+            return sample_linear_rect;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_rect;
+            return sample_nearest_rect;
          }
       case GL_TEXTURE_1D_ARRAY_EXT:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
 	 else if (needLambda) {
-            return &sample_lambda_1d_array;
+            return sample_lambda_1d_array;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_1d_array;
+            return sample_linear_1d_array;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_1d_array;
+            return sample_nearest_1d_array;
          }
       case GL_TEXTURE_2D_ARRAY_EXT:
          if (is_depth_texture(t)) {
-            return &sample_depth_texture;
+            return sample_depth_texture;
          }
 	 else if (needLambda) {
-            return &sample_lambda_2d_array;
+            return sample_lambda_2d_array;
          }
          else if (sampler->MinFilter == GL_LINEAR) {
-            return &sample_linear_2d_array;
+            return sample_linear_2d_array;
          }
          else {
             assert(sampler->MinFilter == GL_NEAREST);
-            return &sample_nearest_2d_array;
+            return sample_nearest_2d_array;
          }
       default:
          _mesa_problem(ctx,
                        "invalid target in _swrast_choose_texture_sample_func");
-         return &null_sample_func;
+         return null_sample_func;
       }
    }
 }
diff --git a/src/mesa/swrast/s_tritemp.h b/src/mesa/swrast/s_tritemp.h
index fddbbfd99d6..1d71839713c 100644
--- a/src/mesa/swrast/s_tritemp.h
+++ b/src/mesa/swrast/s_tritemp.h
@@ -242,7 +242,7 @@ static void NAME(struct gl_context *ctx, const SWvertex *v0,
       if (IS_INF_OR_NAN(area) || area == 0.0F)
          return;
 
-      if (area * bf * swrast->_BackfaceCullSign < 0.0)
+      if (area * bf * swrast->_BackfaceCullSign < 0.0F)
          return;
 
       oneOverArea = 1.0F / area;
diff --git a/src/mesa/swrast/s_zoom.c b/src/mesa/swrast/s_zoom.c
index 9879e2a5f10..34b8eb19657 100644
--- a/src/mesa/swrast/s_zoom.c
+++ b/src/mesa/swrast/s_zoom.c
@@ -114,7 +114,7 @@ unzoom_x(GLfloat zoomX, GLint imageX, GLint zx)
    (zx - imageX) / zoomX = x - imageX;
    */
    GLint x;
-   if (zoomX < 0.0)
+   if (zoomX < 0.0F)
       zx++;
    x = imageX + (GLint) ((zx - imageX) / zoomX);
    return x;
diff --git a/src/mesa/swrast_setup/ss_tritmp.h b/src/mesa/swrast_setup/ss_tritmp.h
index c38c76a4adb..adb77bd3247 100644
--- a/src/mesa/swrast_setup/ss_tritmp.h
+++ b/src/mesa/swrast_setup/ss_tritmp.h
@@ -58,7 +58,7 @@ static void TAG(triangle)(struct gl_context *ctx, GLuint e0, GLuint e1, GLuint e
 
       if (IND & (SS_TWOSIDE_BIT | SS_UNFILLED_BIT))
       {
-	 facing = (cc < 0.0) ^ ctx->Polygon._FrontBit;
+	 facing = (cc < 0.0F) ^ ctx->Polygon._FrontBit;
 
 	 if (IND & SS_UNFILLED_BIT)
 	    mode = facing ? ctx->Polygon.BackMode : ctx->Polygon.FrontMode;
@@ -138,7 +138,7 @@ static void TAG(triangle)(struct gl_context *ctx, GLuint e0, GLuint e1, GLuint e
           * so no MRD value is used here.
           */
 	 offset = ctx->Polygon.OffsetUnits;
-	 if (cc * cc > 1e-16) {
+	 if (cc * cc > 1e-16F) {
 	    const GLfloat ez = z[0] - z[2];
 	    const GLfloat fz = z[1] - z[2];
 	    const GLfloat oneOverArea = 1.0F / cc;
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index bc77ba8bf95..b5c0b3e1f5b 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -190,7 +190,7 @@ _tnl_InvalidateState( struct gl_context *ctx, GLuint new_state )
    }
 
    if (new_state & (_NEW_VIEWPORT | _NEW_BUFFERS)) {
-      double scale[3], translate[3];
+      float scale[3], translate[3];
       _mesa_get_viewport_xform(ctx, 0, scale, translate);
       _math_matrix_viewport(&tnl->_WindowMap, scale, translate,
                             ctx->DrawBuffer->_DepthMaxF);
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index 6adf1dce676..c130ab3f93d 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -257,7 +257,7 @@ static GLboolean *_tnl_import_edgeflag( struct gl_context *ctx,
    GLuint i;
 
    for (i = 0; i < count; i++) {
-      *bptr++ = ((GLfloat *)ptr)[0] == 1.0;
+      *bptr++ = ((GLfloat *)ptr)[0] == 1.0F;
       ptr += stride;
    }
 
@@ -425,6 +425,7 @@ void _tnl_draw_prims(struct gl_context *ctx,
 			 GLuint min_index,
 			 GLuint max_index,
 			 struct gl_transform_feedback_object *tfb_vertcount,
+                         unsigned stream,
 			 struct gl_buffer_object *indirect)
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
@@ -451,7 +452,7 @@ void _tnl_draw_prims(struct gl_context *ctx,
       printf("%s %d..%d\n", __func__, min_index, max_index);
       for (i = 0; i < nr_prims; i++)
 	 printf("prim %d: %s start %d count %d\n", i, 
-		_mesa_lookup_enum_by_nr(prim[i].mode),
+		_mesa_enum_to_string(prim[i].mode),
 		prim[i].start,
 		prim[i].count);
    }
diff --git a/src/mesa/tnl/t_rasterpos.c b/src/mesa/tnl/t_rasterpos.c
index d4b45bac9ac..4bd9ac8539e 100644
--- a/src/mesa/tnl/t_rasterpos.c
+++ b/src/mesa/tnl/t_rasterpos.c
@@ -148,7 +148,7 @@ shade_rastpos(struct gl_context *ctx,
 	 SUB_3V(VP, light->_Position, vertex);
          /* d = length(VP) */
 	 d = (GLfloat) LEN_3FV( VP );
-	 if (d > 1.0e-6) {
+	 if (d > 1.0e-6F) {
             /* normalize VP */
 	    GLfloat invd = 1.0F / d;
 	    SELF_SCALE_SCALAR_3V(VP, invd);
@@ -172,7 +172,7 @@ shade_rastpos(struct gl_context *ctx,
 	 }
       }
 
-      if (attenuation < 1e-3)
+      if (attenuation < 1e-3F)
 	 continue;
 
       n_dot_VP = DOT3( normal, VP );
@@ -219,7 +219,7 @@ shade_rastpos(struct gl_context *ctx,
 	    shine = ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_SHININESS][0];
 	    spec_coef = powf(n_dot_h, shine);
 
-	    if (spec_coef > 1.0e-10) {
+	    if (spec_coef > 1.0e-10F) {
                if (ctx->Light.Model.ColorControl==GL_SEPARATE_SPECULAR_COLOR) {
                   ACC_SCALE_SCALAR_3V( specularContrib, spec_coef,
                                        light->_MatSpecular[0]);
@@ -378,7 +378,7 @@ _tnl_RasterPos(struct gl_context *ctx, const GLfloat vObj[4])
       GLfloat eye[4], clip[4], ndc[3], d;
       GLfloat *norm, eyenorm[3];
       GLfloat *objnorm = ctx->Current.Attrib[VERT_ATTRIB_NORMAL];
-      double scale[3], translate[3];
+      float scale[3], translate[3];
 
       /* apply modelview matrix:  eye = MV * obj */
       TRANSFORM_POINT( eye, ctx->ModelviewMatrixStack.Top->m, vObj );
diff --git a/src/mesa/tnl/t_vb_fog.c b/src/mesa/tnl/t_vb_fog.c
index 1ca72f866b7..5489ed6857f 100644
--- a/src/mesa/tnl/t_vb_fog.c
+++ b/src/mesa/tnl/t_vb_fog.c
@@ -45,8 +45,8 @@ struct fog_stage_data {
 #define FOG_STAGE_DATA(stage) ((struct fog_stage_data *)stage->privatePtr)
 
 #define FOG_EXP_TABLE_SIZE 256
-#define FOG_MAX (10.0)
-#define EXP_FOG_MAX .0006595
+#define FOG_MAX (10.0F)
+#define EXP_FOG_MAX .0006595F
 #define FOG_INCR (FOG_MAX/FOG_EXP_TABLE_SIZE)
 static GLfloat exp_table[FOG_EXP_TABLE_SIZE];
 static GLfloat inited = 0;
@@ -54,7 +54,7 @@ static GLfloat inited = 0;
 #if 1
 #define NEG_EXP( result, narg )						\
 do {									\
-   GLfloat f = (GLfloat) (narg * (1.0/FOG_INCR));			\
+   GLfloat f = (GLfloat) (narg * (1.0F / FOG_INCR));			\
    GLint k = (GLint) f;							\
    if (k > FOG_EXP_TABLE_SIZE-2) 					\
       result = (GLfloat) EXP_FOG_MAX;					\
diff --git a/src/mesa/tnl/t_vb_light.c b/src/mesa/tnl/t_vb_light.c
index dbd57fa6bfe..029265a4f83 100644
--- a/src/mesa/tnl/t_vb_light.c
+++ b/src/mesa/tnl/t_vb_light.c
@@ -137,23 +137,23 @@ validate_shine_table( struct gl_context *ctx, GLuint side, GLfloat shininess )
 	    break;
 
       m = s->tab;
-      m[0] = 0.0;
-      if (shininess == 0.0) {
+      m[0] = 0.0F;
+      if (shininess == 0.0F) {
 	 for (j = 1 ; j <= SHINE_TABLE_SIZE ; j++)
-	    m[j] = 1.0;
+	    m[j] = 1.0F;
       }
       else {
 	 for (j = 1 ; j < SHINE_TABLE_SIZE ; j++) {
-            GLdouble t, x = j / (GLfloat) (SHINE_TABLE_SIZE - 1);
-            if (x < 0.005) /* underflow check */
-               x = 0.005;
-            t = pow(x, shininess);
-	    if (t > 1e-20)
-	       m[j] = (GLfloat) t;
+            GLfloat t, x = j / (GLfloat) (SHINE_TABLE_SIZE - 1);
+            if (x < 0.005F) /* underflow check */
+               x = 0.005F;
+            t = powf(x, shininess);
+	    if (t > 1e-20F)
+	       m[j] = t;
 	    else
-	       m[j] = 0.0;
+	       m[j] = 0.0F;
 	 }
-	 m[SHINE_TABLE_SIZE] = 1.0;
+	 m[SHINE_TABLE_SIZE] = 1.0F;
       }
 
       s->shininess = shininess;
diff --git a/src/mesa/tnl/t_vb_lighttmp.h b/src/mesa/tnl/t_vb_lighttmp.h
index f8786accbbb..3aebcd4b799 100644
--- a/src/mesa/tnl/t_vb_lighttmp.h
+++ b/src/mesa/tnl/t_vb_lighttmp.h
@@ -112,7 +112,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 	 GLint side;
 	 GLfloat contrib[3];
 	 GLfloat attenuation;
-	 GLfloat VP[3];  /* unit vector from vertex to light */
+	 GLfloat VP[3];          /* unit vector from vertex to light */
 	 GLfloat n_dot_VP;       /* n dot VP */
 	 GLfloat *h;
 
@@ -129,7 +129,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 
 	    d = (GLfloat) LEN_3FV( VP );
 
-	    if (d > 1e-6) {
+	    if (d > 1e-6F) {
 	       GLfloat invd = 1.0F / d;
 	       SELF_SCALE_SCALAR_3V(VP, invd);
 	    }
@@ -152,7 +152,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 	    }
 	 }
 
-	 if (attenuation < 1e-3)
+	 if (attenuation < 1e-3F)
 	    continue;		/* this light makes no contribution */
 
 	 /* Compute dot product or normal and vector from V to light pos */
@@ -204,7 +204,7 @@ static void TAG(light_rgba_spec)( struct gl_context *ctx,
 
 	 if (n_dot_h > 0.0F) {
 	    GLfloat spec_coef = lookup_shininess(ctx, side, n_dot_h);
-	    if (spec_coef > 1.0e-10) {
+	    if (spec_coef > 1.0e-10F) {
 	       spec_coef *= attenuation;
 	       ACC_SCALE_SCALAR_3V( spec[side], spec_coef,
 				    light->_MatSpecular[side]);
@@ -283,12 +283,11 @@ static void TAG(light_rgba)( struct gl_context *ctx,
 
       /* Add contribution from each enabled light source */
       foreach (light, &ctx->Light.EnabledList) {
-
 	 GLfloat n_dot_h;
 	 GLfloat correction;
 	 GLint side;
 	 GLfloat contrib[3];
-	 GLfloat attenuation = 1.0;
+	 GLfloat attenuation;
 	 GLfloat VP[3];          /* unit vector from vertex to light */
 	 GLfloat n_dot_VP;       /* n dot VP */
 	 GLfloat *h;
@@ -302,12 +301,11 @@ static void TAG(light_rgba)( struct gl_context *ctx,
 	 else {
 	    GLfloat d;     /* distance from vertex to light */
 
-
 	    SUB_3V(VP, light->_Position, vertex);
 
 	    d = (GLfloat) LEN_3FV( VP );
 
-	    if ( d > 1e-6) {
+	    if (d > 1e-6F) {
 	       GLfloat invd = 1.0F / d;
 	       SELF_SCALE_SCALAR_3V(VP, invd);
 	    }
@@ -330,7 +328,7 @@ static void TAG(light_rgba)( struct gl_context *ctx,
 	    }
 	 }
 
-	 if (attenuation < 1e-3)
+	 if (attenuation < 1e-3F)
 	    continue;		/* this light makes no contribution */
 
 	 /* Compute dot product or normal and vector from V to light pos */
diff --git a/src/mesa/tnl/t_vb_normals.c b/src/mesa/tnl/t_vb_normals.c
index 9aee1a2fb0b..6fc89c23b33 100644
--- a/src/mesa/tnl/t_vb_normals.c
+++ b/src/mesa/tnl/t_vb_normals.c
@@ -114,7 +114,7 @@ validate_normal_stage(struct gl_context *ctx, struct tnl_pipeline_stage *stage)
 	 store->NormalTransform = _mesa_normal_tab[transform | NORM_NORMALIZE];
       }
       else if (ctx->Transform.RescaleNormals &&
-               ctx->_ModelViewInvScale != 1.0) {
+               ctx->_ModelViewInvScale != 1.0F) {
 	 store->NormalTransform = _mesa_normal_tab[transform | NORM_RESCALE];
       }
       else {
@@ -131,7 +131,7 @@ validate_normal_stage(struct gl_context *ctx, struct tnl_pipeline_stage *stage)
 	 store->NormalTransform = _mesa_normal_tab[NORM_NORMALIZE];
       }
       else if (!ctx->Transform.RescaleNormals &&
-	       ctx->_ModelViewInvScale != 1.0) {
+	       ctx->_ModelViewInvScale != 1.0F) {
 	 store->NormalTransform = _mesa_normal_tab[NORM_RESCALE];
       }
       else {
diff --git a/src/mesa/tnl/t_vb_render.c b/src/mesa/tnl/t_vb_render.c
index 4960ac0969e..03e8fcfa196 100644
--- a/src/mesa/tnl/t_vb_render.c
+++ b/src/mesa/tnl/t_vb_render.c
@@ -315,7 +315,7 @@ static GLboolean run_render( struct gl_context *ctx,
 
 	 if (MESA_VERBOSE & VERBOSE_PRIMS) 
 	    _mesa_debug(NULL, "MESA prim %s %d..%d\n", 
-			_mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
+			_mesa_enum_to_string(prim & PRIM_MODE_MASK), 
 			start, start+length);
 
 	 if (length)
diff --git a/src/mesa/tnl/t_vertex_generic.c b/src/mesa/tnl/t_vertex_generic.c
index 2a25a96928f..6c40c868363 100644
--- a/src/mesa/tnl/t_vertex_generic.c
+++ b/src/mesa/tnl/t_vertex_generic.c
@@ -1026,7 +1026,7 @@ void _tnl_generic_interp( struct gl_context *ctx,
 
    if (tnl->NeedNdcCoords) {
       const GLfloat *dstclip = VB->ClipPtr->data[edst];
-      if (dstclip[3] != 0.0) {
+      if (dstclip[3] != 0.0f) {
 	 const GLfloat w = 1.0f / dstclip[3];
 	 GLfloat pos[4];
 
diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c
index 30dc1a72080..14e7812ec78 100644
--- a/src/mesa/tnl/t_vertex_sse.c
+++ b/src/mesa/tnl/t_vertex_sse.c
@@ -592,7 +592,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
 	    break;
 	 case GL_UNSIGNED_SHORT:
 	 default:
-	    printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
+	    printf("unknown CHAN_TYPE %s\n", _mesa_enum_to_string(CHAN_TYPE));
 	    return GL_FALSE;
 	 }
 	 break;
diff --git a/src/mesa/tnl/tnl.h b/src/mesa/tnl/tnl.h
index 8c59ff9e58f..5a9938e7afb 100644
--- a/src/mesa/tnl/tnl.h
+++ b/src/mesa/tnl/tnl.h
@@ -76,7 +76,7 @@ struct _mesa_prim;
 struct _mesa_index_buffer;
 
 void
-_tnl_draw_prims( struct gl_context *ctx,
+_tnl_draw_prims(struct gl_context *ctx,
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
@@ -84,6 +84,7 @@ _tnl_draw_prims( struct gl_context *ctx,
 		     GLuint min_index,
 		     GLuint max_index,
 		     struct gl_transform_feedback_object *tfb_vertcount,
+                     unsigned stream,
 		     struct gl_buffer_object *indirect );
 
 extern void
diff --git a/src/mesa/tnl_dd/t_dd_dmatmp.h b/src/mesa/tnl_dd/t_dd_dmatmp.h
index 667e2a6e5d5..7be39541e43 100644
--- a/src/mesa/tnl_dd/t_dd_dmatmp.h
+++ b/src/mesa/tnl_dd/t_dd_dmatmp.h
@@ -1256,7 +1256,7 @@ static GLboolean TAG(validate_render)( struct gl_context *ctx,
       }
       
       if (!ok) {
-/* 	 fprintf(stderr, "not ok %s\n", _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK)); */
+/* 	 fprintf(stderr, "not ok %s\n", _mesa_enum_to_string(prim & PRIM_MODE_MASK)); */
 	 return GL_FALSE;
       }
    }
diff --git a/src/mesa/tnl_dd/t_dd_unfilled.h b/src/mesa/tnl_dd/t_dd_unfilled.h
index 82190c08916..ee15e773c88 100644
--- a/src/mesa/tnl_dd/t_dd_unfilled.h
+++ b/src/mesa/tnl_dd/t_dd_unfilled.h
@@ -60,7 +60,7 @@ static void TAG(unfilled_tri)( struct gl_context *ctx,
    }
 
 /*     fprintf(stderr, "%s %s %d %d %d\n", __func__, */
-/*  	   _mesa_lookup_enum_by_nr( mode ), */
+/*  	   _mesa_enum_to_string( mode ), */
 /*  	   ef[e0], ef[e1], ef[e2]); */
 
    if (mode == GL_POINT) {
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index 54dee6c464f..2aaff5df019 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -97,7 +97,8 @@ typedef void (*vbo_draw_func)( struct gl_context *ctx,
 			       GLuint min_index,
 			       GLuint max_index,
 			       struct gl_transform_feedback_object *tfb_vertcount,
-			       struct gl_buffer_object *indirect );
+                               unsigned stream,
+			       struct gl_buffer_object *indirect);
 
 
 
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index fd1ffe2f76d..e3eb286e482 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -37,9 +37,9 @@
 
 static GLuint check_size( const GLfloat *attr )
 {
-   if (attr[3] != 1.0) return 4;
-   if (attr[2] != 0.0) return 3;
-   if (attr[1] != 0.0) return 2;
+   if (attr[3] != 1.0F) return 4;
+   if (attr[2] != 0.0F) return 3;
+   if (attr[1] != 0.0F) return 2;
    return 1;		
 }
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 72b8206ec23..34d2c1d3d6b 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -255,7 +255,7 @@ check_array_data(struct gl_context *ctx, struct gl_client_array *array,
             GLint k;
             for (k = 0; k < array->Size; k++) {
                if (IS_INF_OR_NAN(f[k]) ||
-                   f[k] >= 1.0e20 || f[k] <= -1.0e10) {
+                   f[k] >= 1.0e20F || f[k] <= -1.0e10F) {
                   printf("Bad array data:\n");
                   printf("  Element[%u].%u = %f\n", j, k, f[k]);
                   printf("  Array %u at %p\n", attrib, (void* ) array);
@@ -263,7 +263,7 @@ check_array_data(struct gl_context *ctx, struct gl_client_array *array,
 			 array->Type, array->Size, array->Stride);
                   printf("  Address/offset %p in Buffer Object %u\n",
 			 array->Ptr, array->BufferObj->Name);
-                  f[k] = 1.0; /* XXX replace the bad value! */
+                  f[k] = 1.0F; /* XXX replace the bad value! */
                }
                /*assert(!IS_INF_OR_NAN(f[k]));*/
             }
@@ -633,7 +633,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start,
          /* draw one or two prims */
          check_buffers_are_unmapped(exec->array.inputs);
          vbo->draw_prims(ctx, prim, primCount, NULL,
-                         GL_TRUE, start, start + count - 1, NULL, NULL);
+                         GL_TRUE, start, start + count - 1, NULL, 0, NULL);
       }
    }
    else {
@@ -644,7 +644,7 @@ vbo_draw_arrays(struct gl_context *ctx, GLenum mode, GLint start,
       check_buffers_are_unmapped(exec->array.inputs);
       vbo->draw_prims(ctx, prim, 1, NULL,
                       GL_TRUE, start, start + count - 1,
-                      NULL, NULL);
+                      NULL, 0, NULL);
    }
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
@@ -786,7 +786,7 @@ vbo_exec_DrawArrays(GLenum mode, GLint start, GLsizei count)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArrays(%s, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), start, count);
+                  _mesa_enum_to_string(mode), start, count);
 
    if (!_mesa_validate_DrawArrays(ctx, mode, count))
       return;
@@ -813,7 +813,7 @@ vbo_exec_DrawArraysInstanced(GLenum mode, GLint start, GLsizei count,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArraysInstanced(%s, %d, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), start, count, numInstances);
+                  _mesa_enum_to_string(mode), start, count, numInstances);
 
    if (!_mesa_validate_DrawArraysInstanced(ctx, mode, start, count, numInstances))
       return;
@@ -839,7 +839,7 @@ vbo_exec_DrawArraysInstancedBaseInstance(GLenum mode, GLint first, GLsizei count
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArraysInstancedBaseInstance(%s, %d, %d, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), first, count,
+                  _mesa_enum_to_string(mode), first, count,
                   numInstances, baseInstance);
 
    if (!_mesa_validate_DrawArraysInstanced(ctx, mode, first, count,
@@ -990,7 +990,7 @@ vbo_validated_drawrangeelements(struct gl_context *ctx, GLenum mode,
 
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1, &ib,
-                   index_bounds_valid, start, end, NULL, NULL);
+                   index_bounds_valid, start, end, NULL, 0, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
       _mesa_flush(ctx);
@@ -1021,8 +1021,8 @@ vbo_exec_DrawRangeElementsBaseVertex(GLenum mode,
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx,
                 "glDrawRangeElementsBaseVertex(%s, %u, %u, %d, %s, %p, %d)\n",
-                _mesa_lookup_enum_by_nr(mode), start, end, count,
-                _mesa_lookup_enum_by_nr(type), indices, basevertex);
+                _mesa_enum_to_string(mode), start, end, count,
+                _mesa_enum_to_string(type), indices, basevertex);
 
    if (!_mesa_validate_DrawRangeElements(ctx, mode, start, end, count,
                                          type, indices))
@@ -1099,8 +1099,8 @@ vbo_exec_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
       GET_CURRENT_CONTEXT(ctx);
       _mesa_debug(ctx,
                   "glDrawRangeElements(%s, %u, %u, %d, %s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode), start, end, count,
-                  _mesa_lookup_enum_by_nr(type), indices);
+                  _mesa_enum_to_string(mode), start, end, count,
+                  _mesa_enum_to_string(type), indices);
    }
 
    vbo_exec_DrawRangeElementsBaseVertex(mode, start, end, count, type,
@@ -1119,8 +1119,8 @@ vbo_exec_DrawElements(GLenum mode, GLsizei count, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElements(%s, %u, %s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices);
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices);
 
    if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
       return;
@@ -1141,8 +1141,8 @@ vbo_exec_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsBaseVertex(%s, %d, %s, %p, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices, basevertex);
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices, basevertex);
 
    if (!_mesa_validate_DrawElements(ctx, mode, count, type, indices))
       return;
@@ -1163,8 +1163,8 @@ vbo_exec_DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstanced(%s, %d, %s, %p, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices, numInstances);
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices, numInstances);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
                                              numInstances))
@@ -1187,8 +1187,8 @@ vbo_exec_DrawElementsInstancedBaseVertex(GLenum mode, GLsizei count, GLenum type
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstancedBaseVertex(%s, %d, %s, %p, %d; %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices,
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices,
                   numInstances, basevertex);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
@@ -1212,8 +1212,8 @@ vbo_exec_DrawElementsInstancedBaseInstance(GLenum mode, GLsizei count, GLenum ty
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstancedBaseInstance(%s, %d, %s, %p, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices,
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices,
                   numInstances, baseInstance);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
@@ -1238,8 +1238,8 @@ vbo_exec_DrawElementsInstancedBaseVertexBaseInstance(GLenum mode, GLsizei count,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsInstancedBaseVertexBaseInstance(%s, %d, %s, %p, %d, %d, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), count,
-                  _mesa_lookup_enum_by_nr(type), indices,
+                  _mesa_enum_to_string(mode), count,
+                  _mesa_enum_to_string(type), indices,
                   numInstances, basevertex, baseInstance);
 
    if (!_mesa_validate_DrawElementsInstanced(ctx, mode, count, type, indices,
@@ -1350,7 +1350,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
 
       check_buffers_are_unmapped(exec->array.inputs);
       vbo->draw_prims(ctx, prim, primcount, &ib,
-                      false, ~0, ~0, NULL, NULL);
+                      false, ~0, ~0, NULL, 0, NULL);
    } else {
       /* render one prim at a time */
       for (i = 0; i < primcount; i++) {
@@ -1379,7 +1379,7 @@ vbo_validated_multidrawelements(struct gl_context *ctx, GLenum mode,
 
          check_buffers_are_unmapped(exec->array.inputs);
          vbo->draw_prims(ctx, prim, 1, &ib,
-                         false, ~0, ~0, NULL, NULL);
+                         false, ~0, ~0, NULL, 0, NULL);
       }
    }
 
@@ -1464,7 +1464,7 @@ vbo_draw_transform_feedback(struct gl_context *ctx, GLenum mode,
 
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1, NULL,
-                   GL_TRUE, 0, 0, obj, NULL);
+                   GL_TRUE, 0, 0, obj, stream, NULL);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH) {
       _mesa_flush(ctx);
@@ -1488,7 +1488,7 @@ vbo_exec_DrawTransformFeedback(GLenum mode, GLuint name)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedback(%s, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), name);
+                  _mesa_enum_to_string(mode), name);
 
    vbo_draw_transform_feedback(ctx, mode, obj, 0, 1);
 }
@@ -1502,7 +1502,7 @@ vbo_exec_DrawTransformFeedbackStream(GLenum mode, GLuint name, GLuint stream)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedbackStream(%s, %u, %u)\n",
-                  _mesa_lookup_enum_by_nr(mode), name, stream);
+                  _mesa_enum_to_string(mode), name, stream);
 
    vbo_draw_transform_feedback(ctx, mode, obj, stream, 1);
 }
@@ -1517,7 +1517,7 @@ vbo_exec_DrawTransformFeedbackInstanced(GLenum mode, GLuint name,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedbackInstanced(%s, %d)\n",
-                  _mesa_lookup_enum_by_nr(mode), name);
+                  _mesa_enum_to_string(mode), name);
 
    vbo_draw_transform_feedback(ctx, mode, obj, 0, primcount);
 }
@@ -1533,7 +1533,7 @@ vbo_exec_DrawTransformFeedbackStreamInstanced(GLenum mode, GLuint name,
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawTransformFeedbackStreamInstanced"
                   "(%s, %u, %u, %i)\n",
-                  _mesa_lookup_enum_by_nr(mode), name, stream, primcount);
+                  _mesa_enum_to_string(mode), name, stream, primcount);
 
    vbo_draw_transform_feedback(ctx, mode, obj, stream, primcount);
 }
@@ -1563,7 +1563,7 @@ vbo_validated_drawarraysindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1,
                    NULL, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
@@ -1603,7 +1603,7 @@ vbo_validated_multidrawarraysindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, primcount,
                    NULL, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    free(prim);
@@ -1640,7 +1640,7 @@ vbo_validated_drawelementsindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, 1,
                    &ib, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    if (MESA_DEBUG_FLAGS & DEBUG_ALWAYS_FLUSH)
@@ -1689,7 +1689,7 @@ vbo_validated_multidrawelementsindirect(struct gl_context *ctx,
    check_buffers_are_unmapped(exec->array.inputs);
    vbo->draw_prims(ctx, prim, primcount,
                    &ib, GL_TRUE, 0, ~0,
-                   NULL,
+                   NULL, 0,
                    ctx->DrawIndirectBuffer);
 
    free(prim);
@@ -1709,7 +1709,7 @@ vbo_exec_DrawArraysIndirect(GLenum mode, const GLvoid *indirect)
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawArraysIndirect(%s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode), indirect);
+                  _mesa_enum_to_string(mode), indirect);
 
    if (!_mesa_validate_DrawArraysIndirect(ctx, mode, indirect))
       return;
@@ -1725,8 +1725,8 @@ vbo_exec_DrawElementsIndirect(GLenum mode, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glDrawElementsIndirect(%s, %s, %p)\n",
-                  _mesa_lookup_enum_by_nr(mode),
-                  _mesa_lookup_enum_by_nr(type), indirect);
+                  _mesa_enum_to_string(mode),
+                  _mesa_enum_to_string(type), indirect);
 
    if (!_mesa_validate_DrawElementsIndirect(ctx, mode, type, indirect))
       return;
@@ -1743,7 +1743,7 @@ vbo_exec_MultiDrawArraysIndirect(GLenum mode,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glMultiDrawArraysIndirect(%s, %p, %i, %i)\n",
-                  _mesa_lookup_enum_by_nr(mode), indirect, primcount, stride);
+                  _mesa_enum_to_string(mode), indirect, primcount, stride);
 
    /* If <stride> is zero, the array elements are treated as tightly packed. */
    if (stride == 0)
@@ -1768,8 +1768,8 @@ vbo_exec_MultiDrawElementsIndirect(GLenum mode, GLenum type,
 
    if (MESA_VERBOSE & VERBOSE_DRAW)
       _mesa_debug(ctx, "glMultiDrawElementsIndirect(%s, %s, %p, %i, %i)\n",
-                  _mesa_lookup_enum_by_nr(mode),
-                  _mesa_lookup_enum_by_nr(type), indirect, primcount, stride);
+                  _mesa_enum_to_string(mode),
+                  _mesa_enum_to_string(type), indirect, primcount, stride);
 
    /* If <stride> is zero, the array elements are treated as tightly packed. */
    if (stride == 0)
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 37b53a8309d..2bfb0c32b73 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -412,7 +412,7 @@ vbo_exec_vtx_flush(struct vbo_exec_context *exec, GLboolean keepUnmapped)
 				       GL_TRUE,
 				       0,
 				       exec->vtx.vert_count - 1,
-				       NULL, NULL);
+				       NULL, 0, NULL);
 
 	 /* If using a real VBO, get new storage -- unless asked not to.
           */
diff --git a/src/mesa/vbo/vbo_primitive_restart.c b/src/mesa/vbo/vbo_primitive_restart.c
index dafc4fd2a9a..0662c5cd4ef 100644
--- a/src/mesa/vbo/vbo_primitive_restart.c
+++ b/src/mesa/vbo/vbo_primitive_restart.c
@@ -251,11 +251,11 @@ vbo_sw_primitive_restart(struct gl_context *ctx,
                 (temp_prim.count == sub_prim->count)) {
                draw_prims_func(ctx, &temp_prim, 1, ib,
                                GL_TRUE, sub_prim->min_index, sub_prim->max_index,
-                               NULL, NULL);
+                               NULL, 0, NULL);
             } else {
                draw_prims_func(ctx, &temp_prim, 1, ib,
                                GL_FALSE, -1, -1,
-                               NULL, NULL);
+                               NULL, 0, NULL);
             }
          }
          if (sub_end_index >= end_index) {
diff --git a/src/mesa/vbo/vbo_rebase.c b/src/mesa/vbo/vbo_rebase.c
index c3c4b64e65c..24c04ca7e6a 100644
--- a/src/mesa/vbo/vbo_rebase.c
+++ b/src/mesa/vbo/vbo_rebase.c
@@ -258,7 +258,7 @@ void vbo_rebase_prims( struct gl_context *ctx,
 	 GL_TRUE,
 	 0, 
 	 max_index - min_index,
-	 NULL, NULL );
+	 NULL, 0, NULL );
 
    ctx->Array._DrawArrays = saved_arrays;
    ctx->NewDriverState |= ctx->DriverFlags.NewArray;
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index de744e0c763..b1fd6892026 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -314,7 +314,7 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data)
                                       GL_TRUE,
                                       0,    /* Node is a VBO, so this is ok */
                                       node->count - 1,
-                                      NULL, NULL);
+                                      NULL, 0, NULL);
       }
    }
 
diff --git a/src/mesa/vbo/vbo_split_copy.c b/src/mesa/vbo/vbo_split_copy.c
index 7b1e20b18d2..cb27ef961ab 100644
--- a/src/mesa/vbo/vbo_split_copy.c
+++ b/src/mesa/vbo/vbo_split_copy.c
@@ -203,7 +203,7 @@ flush( struct copy_context *copy )
 	       GL_TRUE,
 	       0,
 	       copy->dstbuf_nr - 1,
-	       NULL, NULL );
+	       NULL, 0, NULL );
 
    ctx->Array._DrawArrays = saved_arrays;
    ctx->NewDriverState |= ctx->DriverFlags.NewArray;
diff --git a/src/mesa/vbo/vbo_split_inplace.c b/src/mesa/vbo/vbo_split_inplace.c
index 5887b74d829..cff4bcd30ff 100644
--- a/src/mesa/vbo/vbo_split_inplace.c
+++ b/src/mesa/vbo/vbo_split_inplace.c
@@ -94,7 +94,7 @@ static void flush_vertex( struct split_context *split )
 	       !split->ib,
 	       split->min_index,
 	       split->max_index,
-	       NULL, NULL);
+	       NULL, 0, NULL);
 
    ctx->Array._DrawArrays = saved_arrays;
    ctx->NewDriverState |= ctx->DriverFlags.NewArray;
diff --git a/src/util/Makefile.am b/src/util/Makefile.am
index 2e7542e4245..1e087b40d38 100644
--- a/src/util/Makefile.am
+++ b/src/util/Makefile.am
@@ -46,9 +46,9 @@ libmesautil_la_SOURCES = \
 
 if ENABLE_SHADER_CACHE
 libmesautil_la_SOURCES += $(MESA_UTIL_SHADER_CACHE_FILES)
-endif
 
 libmesautil_la_LIBADD = $(SHA1_LIBS)
+endif
 
 roundeven_test_LDADD = -lm
 
diff --git a/src/util/Makefile.sources b/src/util/Makefile.sources
index dc559391823..82df3bcb00a 100644
--- a/src/util/Makefile.sources
+++ b/src/util/Makefile.sources
@@ -19,7 +19,7 @@ MESA_UTIL_FILES :=	\
 	set.c \
 	set.h \
 	simple_list.h \
-	strtod.cpp \
+	strtod.c \
 	strtod.h \
 	texcompress_rgtc_tmp.h \
 	u_atomic.h
diff --git a/src/util/SConscript b/src/util/SConscript
index 9e4d481f838..3dbe70a2e8a 100644
--- a/src/util/SConscript
+++ b/src/util/SConscript
@@ -54,3 +54,10 @@ u_atomic_test = env.Program(
 )
 alias = env.Alias("u_atomic_test", u_atomic_test, u_atomic_test[0].abspath)
 AlwaysBuild(alias)
+
+roundeven_test = env.Program(
+    target = 'roundeven_test',
+    source = ['roundeven_test.c'],
+)
+alias = env.Alias("roundeven_test", roundeven_test, roundeven_test[0].abspath)
+AlwaysBuild(alias)
diff --git a/src/util/macros.h b/src/util/macros.h
index 3b708ed6aa2..84e4f182bcf 100644
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -103,6 +103,17 @@ do {                       \
 #define assume(expr) assert(expr)
 #endif
 
+/* Attribute const is used for functions that have no effects other than their
+ * return value, and only rely on the argument values to compute the return
+ * value.  As a result, calls to it can be CSEed.  Note that using memory
+ * pointed to by the arguments is not allowed for const functions.
+ */
+#ifdef HAVE_FUNC_ATTRIBUTE_CONST
+#define ATTRIBUTE_CONST __attribute__((__const__))
+#else
+#define ATTRIBUTE_CONST
+#endif
+
 #ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN
 #define FLATTEN __attribute__((__flatten__))
 #else
@@ -130,6 +141,15 @@ do {                       \
 #define PACKED
 #endif
 
+/* Attribute pure is used for functions that have no effects other than their
+ * return value.  As a result, calls to it can be dead code eliminated.
+ */
+#ifdef HAVE_FUNC_ATTRIBUTE_PURE
+#define ATTRIBUTE_PURE __attribute__((__pure__))
+#else
+#define ATTRIBUTE_PURE
+#endif
+
 #ifdef __cplusplus
 /**
  * Macro function that evaluates to true if T is a trivially
@@ -182,6 +202,12 @@ do {                       \
 #define UNUSED
 #endif
 
+#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT
+#define MUST_CHECK __attribute__((warn_unused_result))
+#else
+#define MUST_CHECK
+#endif
+
 /** Compute ceiling of integer quotient of A divided by B. */
 #define DIV_ROUND_UP( A, B )  ( (A) % (B) == 0 ? (A)/(B) : (A)/(B)+1 )
 
diff --git a/src/util/register_allocate.c b/src/util/register_allocate.c
index 2ad8c3ce11a..436e008b01a 100644
--- a/src/util/register_allocate.c
+++ b/src/util/register_allocate.c
@@ -321,33 +321,37 @@ ra_set_finalize(struct ra_regs *regs, unsigned int **q_values)
             regs->classes[b]->q[c] = q_values[b][c];
 	 }
       }
-      return;
+   } else {
+      /* Compute, for each class B and C, how many regs of B an
+       * allocation to C could conflict with.
+       */
+      for (b = 0; b < regs->class_count; b++) {
+         for (c = 0; c < regs->class_count; c++) {
+            unsigned int rc;
+            int max_conflicts = 0;
+
+            for (rc = 0; rc < regs->count; rc++) {
+               int conflicts = 0;
+               unsigned int i;
+
+               if (!reg_belongs_to_class(rc, regs->classes[c]))
+                  continue;
+
+               for (i = 0; i < regs->regs[rc].num_conflicts; i++) {
+                  unsigned int rb = regs->regs[rc].conflict_list[i];
+                  if (reg_belongs_to_class(rb, regs->classes[b]))
+                     conflicts++;
+               }
+               max_conflicts = MAX2(max_conflicts, conflicts);
+            }
+            regs->classes[b]->q[c] = max_conflicts;
+         }
+      }
    }
 
-   /* Compute, for each class B and C, how many regs of B an
-    * allocation to C could conflict with.
-    */
-   for (b = 0; b < regs->class_count; b++) {
-      for (c = 0; c < regs->class_count; c++) {
-	 unsigned int rc;
-	 int max_conflicts = 0;
-
-	 for (rc = 0; rc < regs->count; rc++) {
-	    int conflicts = 0;
-	    unsigned int i;
-
-            if (!reg_belongs_to_class(rc, regs->classes[c]))
-	       continue;
-
-	    for (i = 0; i < regs->regs[rc].num_conflicts; i++) {
-	       unsigned int rb = regs->regs[rc].conflict_list[i];
-	       if (reg_belongs_to_class(rb, regs->classes[b]))
-		  conflicts++;
-	    }
-	    max_conflicts = MAX2(max_conflicts, conflicts);
-	 }
-	 regs->classes[b]->q[c] = max_conflicts;
-      }
+   for (b = 0; b < regs->count; b++) {
+      ralloc_free(regs->regs[b].conflict_list);
+      regs->regs[b].conflict_list = NULL;
    }
 }
 
@@ -648,7 +652,7 @@ ra_get_best_spill_node(struct ra_graph *g)
       float cost = g->nodes[n].spill_cost;
       float benefit;
 
-      if (cost <= 0.0)
+      if (cost <= 0.0f)
 	 continue;
 
       if (g->nodes[n].in_stack)
diff --git a/src/util/rounding.h b/src/util/rounding.h
index 0cbe9269f7b..7b5608b8a78 100644
--- a/src/util/rounding.h
+++ b/src/util/rounding.h
@@ -21,7 +21,19 @@
  * IN THE SOFTWARE.
  */
 
+#ifndef _ROUNDING_H
+#define _ROUNDING_H
+
+#include "c99_compat.h" // inline
+
 #include <math.h>
+#include <limits.h>
+#include <stdint.h>
+
+#ifdef __x86_64__
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#endif
 
 #ifdef __SSE4_1__
 #include <smmintrin.h>
@@ -76,3 +88,45 @@ _mesa_roundeven(double x)
    return rint(x);
 #endif
 }
+
+/**
+ * \brief Rounds the float \c x to the nearest integer, with ties to the even
+ * integer (under the default rounding mode), and returns it as a long int.
+ */
+static inline long
+_mesa_lroundevenf(float x)
+{
+#ifdef __x86_64__
+#if LONG_MAX == INT64_MAX /* 64-bit long */
+   return _mm_cvtss_si64(_mm_load_ss(&x)); /* rounds per MXCSR mode */
+#elif LONG_MAX == INT32_MAX /* 32-bit long */
+   return _mm_cvtss_si32(_mm_load_ss(&x)); /* rounds per MXCSR mode */
+#else
+#error "Unsupported long size"
+#endif /* LONG_MAX */
+#else /* !__x86_64__ */
+   return lrintf(x); /* rounds per the current rounding mode */
+#endif /* __x86_64__ */
+}
+
+/**
+ * \brief Rounds the double \c x to the nearest integer, with ties to the even
+ * integer (under the default rounding mode), and returns it as a long int.
+ */
+static inline long
+_mesa_lroundeven(double x)
+{
+#ifdef __x86_64__
+#if LONG_MAX == INT64_MAX /* 64-bit long */
+   return _mm_cvtsd_si64(_mm_load_sd(&x)); /* rounds per MXCSR mode */
+#elif LONG_MAX == INT32_MAX /* 32-bit long */
+   return _mm_cvtsd_si32(_mm_load_sd(&x)); /* rounds per MXCSR mode */
+#else
+#error "Unsupported long size"
+#endif /* LONG_MAX */
+#else /* !__x86_64__ */
+   return lrint(x); /* rounds per the current rounding mode */
+#endif /* __x86_64__ */
+}
+
+#endif /* _ROUNDING_H */
diff --git a/src/util/strtod.cpp b/src/util/strtod.c
similarity index 87%
rename from src/util/strtod.cpp
rename to src/util/strtod.c
index 2b4dd982a80..ea7d395e2da 100644
--- a/src/util/strtod.cpp
+++ b/src/util/strtod.c
@@ -30,18 +30,28 @@
 #include <locale.h>
 #ifdef HAVE_XLOCALE_H
 #include <xlocale.h>
+static locale_t loc; /* "C" locale handle; set by _mesa_locale_init() */
 #endif
 #endif
 
 #include "strtod.h"
 
 
+void
+_mesa_locale_init(void)
+{
 #if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
-static struct locale_initializer {
-   locale_initializer() { loc = newlocale(LC_CTYPE_MASK, "C", NULL); }
-   locale_t loc;
-} loc_init;
+   loc = newlocale(LC_CTYPE_MASK, "C", NULL); /* for strtod_l/strtof_l below */
 #endif
+}
+
+void
+_mesa_locale_fini(void)
+{
+#if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
+   freelocale(loc); /* NOTE(review): loc is not reset; don't parse after this */
+#endif
+}
 
 /**
  * Wrapper around strtod which uses the "C" locale so the decimal
@@ -51,7 +61,7 @@ double
 _mesa_strtod(const char *s, char **end)
 {
 #if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
-   return strtod_l(s, end, loc_init.loc);
+   return strtod_l(s, end, loc);
 #else
    return strtod(s, end);
 #endif
@@ -66,7 +76,7 @@ float
 _mesa_strtof(const char *s, char **end)
 {
 #if defined(_GNU_SOURCE) && defined(HAVE_XLOCALE_H)
-   return strtof_l(s, end, loc_init.loc);
+   return strtof_l(s, end, loc);
 #elif defined(HAVE_STRTOF)
    return strtof(s, end);
 #else
diff --git a/src/util/strtod.h b/src/util/strtod.h
index 02c25ddb78f..60e15cfa0eb 100644
--- a/src/util/strtod.h
+++ b/src/util/strtod.h
@@ -31,6 +31,12 @@
 extern "C" {
 #endif
 
+extern void
+_mesa_locale_init(void); /* set up the "C" locale used by _mesa_strtod/strtof */
+
+extern void
+_mesa_locale_fini(void); /* free the locale created by _mesa_locale_init() */
+
 extern double
 _mesa_strtod(const char *s, char **end);
 
diff --git a/src/vulkan/anv_allocator.c b/src/vulkan/anv_allocator.c
index 0003b3737fc..121ce039250 100644
--- a/src/vulkan/anv_allocator.c
+++ b/src/vulkan/anv_allocator.c
@@ -247,7 +247,7 @@ void
 anv_block_pool_init(struct anv_block_pool *pool,
                     struct anv_device *device, uint32_t block_size)
 {
-   assert(is_power_of_two(block_size));
+   assert(util_is_power_of_two(block_size));
 
    pool->device = device;
    pool->bo.gem_handle = 0;
@@ -388,7 +388,7 @@ anv_fixed_size_state_pool_init(struct anv_fixed_size_state_pool *pool,
                                size_t state_size)
 {
    /* At least a cache line and must divide the block size. */
-   assert(state_size >= 64 && is_power_of_two(state_size));
+   assert(state_size >= 64 && util_is_power_of_two(state_size));
 
    pool->state_size = state_size;
    pool->free_list = ANV_FREE_LIST_EMPTY;
@@ -475,7 +475,7 @@ anv_state_pool_alloc(struct anv_state_pool *pool, size_t size, size_t align)
 void
 anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
 {
-   assert(is_power_of_two(state.alloc_size));
+   assert(util_is_power_of_two(state.alloc_size));
    unsigned size_log2 = ilog2_round_up(state.alloc_size);
    assert(size_log2 >= ANV_MIN_STATE_SIZE_LOG2 &&
           size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
diff --git a/src/vulkan/anv_compiler.cpp b/src/vulkan/anv_compiler.cpp
index ff32b071af2..258abfb52be 100644
--- a/src/vulkan/anv_compiler.cpp
+++ b/src/vulkan/anv_compiler.cpp
@@ -166,7 +166,6 @@ really_do_vs_prog(struct brw_context *brw,
 {
    GLuint program_size;
    const GLuint *program;
-   struct brw_vs_compile c;
    struct brw_vs_prog_data *prog_data = &pipeline->vs_prog_data;
    struct brw_stage_prog_data *stage_prog_data = &prog_data->base.base;
    void *mem_ctx;
@@ -175,14 +174,10 @@ really_do_vs_prog(struct brw_context *brw,
    if (prog)
       vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];
 
-   memset(&c, 0, sizeof(c));
-   memcpy(&c.key, key, sizeof(*key));
    memset(prog_data, 0, sizeof(*prog_data));
 
    mem_ctx = ralloc_context(NULL);
 
-   c.vp = vp;
-
    /* Allocate the references to the uniforms that will end up in the
     * prog_data associated with the compiled program, and which will be freed
     * by the state cache.
@@ -201,7 +196,7 @@ really_do_vs_prog(struct brw_context *brw,
    /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
     * planes as uniforms.
     */
-   param_count += c.key.base.nr_userclip_plane_consts * 4;
+   param_count += key->base.nr_userclip_plane_consts * 4;
 
    /* Setting nr_params here NOT to the size of the param and pull_param
     * arrays, but to the number of uniform components vec4_visitor
@@ -215,7 +210,7 @@ really_do_vs_prog(struct brw_context *brw,
    GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
    prog_data->inputs_read = vp->program.Base.InputsRead;
 
-   if (c.key.copy_edgeflag) {
+   if (key->copy_edgeflag) {
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
       prog_data->inputs_read |= VERT_BIT_EDGEFLAG;
    }
@@ -228,7 +223,7 @@ really_do_vs_prog(struct brw_context *brw,
        * coords, which would be a pain to handle.
        */
       for (int i = 0; i < 8; i++) {
-         if (c.key.point_coord_replace & (1 << i))
+         if (key->point_coord_replace & (1 << i))
             outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
       }
 
@@ -243,7 +238,7 @@ really_do_vs_prog(struct brw_context *brw,
     * distance varying slots whenever clipping is enabled, even if the vertex
     * shader doesn't write to gl_ClipDistance.
     */
-   if (c.key.base.userclip_active) {
+   if (key->base.userclip_active) {
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
       outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
    }
@@ -256,7 +251,8 @@ really_do_vs_prog(struct brw_context *brw,
 
    /* Emit GEN4 code.
     */
-   program = brw_vs_emit(brw, prog, &c, prog_data, mem_ctx, &program_size);
+   program = brw_vs_emit(brw, mem_ctx, key, prog_data, &vp->program,
+                         prog, &program_size);
    if (program == NULL) {
       ralloc_free(mem_ctx);
       return false;
@@ -1009,7 +1005,7 @@ anv_compile_shader_spirv(struct anv_compiler *compiler,
 
    brw_process_nir(mesa_shader->Program->nir,
                    compiler->screen->devinfo,
-                   NULL, mesa_shader->Stage);
+                   NULL, mesa_shader->Stage, false);
 
    setup_nir_io(mesa_shader->Program, mesa_shader->Program->nir);
 
diff --git a/src/vulkan/anv_device.c b/src/vulkan/anv_device.c
index 7eed78c660e..76381e615d3 100644
--- a/src/vulkan/anv_device.c
+++ b/src/vulkan/anv_device.c
@@ -29,6 +29,7 @@
 
 #include "anv_private.h"
 #include "mesa/main/git_sha1.h"
+#include "util/strtod.h"
 
 static int
 anv_env_get_int(const char *name)
@@ -142,6 +143,8 @@ VkResult anv_CreateInstance(
    instance->apiVersion = pCreateInfo->pAppInfo->apiVersion;
    instance->physicalDeviceCount = 0;
 
+   _mesa_locale_init();
+
    VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
 
    *pInstance = anv_instance_to_handle(instance);
@@ -156,6 +159,8 @@ VkResult anv_DestroyInstance(
 
    VG(VALGRIND_DESTROY_MEMPOOL(instance));
 
+   _mesa_locale_fini();
+
    instance->pfnFree(instance->pAllocUserData, instance);
 
    return VK_SUCCESS;
diff --git a/src/vulkan/anv_util.c b/src/vulkan/anv_util.c
index 820356675c7..0311fbcd84f 100644
--- a/src/vulkan/anv_util.c
+++ b/src/vulkan/anv_util.c
@@ -85,8 +85,8 @@ anv_abortfv(const char *format, va_list va)
 int
 anv_vector_init(struct anv_vector *vector, uint32_t element_size, uint32_t size)
 {
-   assert(is_power_of_two(size));
-   assert(element_size < size && is_power_of_two(element_size));
+   assert(util_is_power_of_two(size));
+   assert(element_size < size && util_is_power_of_two(element_size));
 
    vector->head = 0;
    vector->tail = 0;